Exemplo n.º 1
0
    def test_combine_per_returns_sensible_result(self):
        """Health check for CombinePerKey used with SumCombineFn.

        The input holds 30 privacy ids contributing 100.0 and 30 other
        privacy ids contributing -100.0, all to partition "pk1", so the raw
        values cancel out. The test expects ("pk1", 0.0) within a tolerance
        of 10.0.
        """
        # Arrange
        # Rows are (privacy_id, partition_key, value).
        rows = [(f"{u}", "pk1", 100.0) for u in range(30)]
        rows.extend((f"{u + 30}", "pk1", -100.0) for u in range(30))

        with TestPipeline() as pipeline:
            source = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta keep the noise small and therefore
            # reduce test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda row: row[0])
            private_rows = source | 'Create private collection' >> make_private

            # Drop the privacy id from the payload: keep (partition, value).
            keyed_values = private_rows | private_beam.Map(
                lambda row: (row[1], row[2]))

            # Act
            combine_params = private_beam.CombinePerKeyParams(
                max_partitions_contributed=2,
                max_contributions_per_partition=1)
            result = keyed_values | private_beam.CombinePerKey(
                SumCombineFn(), combine_params)
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is
            # deliberately generous to avoid flaky failures.
            matcher = beam_util.equal_to(
                [("pk1", 0.0)],
                equals_fn=lambda expected, actual: PrivateBeamTest.
                value_per_key_within_tolerance(expected, actual, 10.0))
            beam_util.assert_that(result, matcher)
Exemplo n.º 2
0
    def test_select_partitions_calls_select_partitions_with_params(
            self, mock_select_partitions):
        """Checks SelectPartitions forwards its params to the mocked call.

        Args:
            mock_select_partitions: mock object supplied to this test
              (presumably by a patch decorator that is outside this view).
        """
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            source = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_values = (source |
                              'Create private collection' >> make_private)

            params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.5)
            extract_partition = lambda x: f"pk:{x // 10}"

            # Act
            transformer = private_beam.SelectPartitions(
                select_partitions_params=params,
                partition_extractor=extract_partition,
                label="Test select partitions")
            private_values | transformer

            # Assert
            # The transform must have picked up the collection's accountant.
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_select_partitions.assert_called_once()

            # The second positional argument of the mocked call must be the
            # params object given to the transform.
            positional_args = mock_select_partitions.call_args[0]
            self.assertEqual(positional_args[1], params)
Exemplo n.º 3
0
    def test_select_private_partitions_returns_sensible_result(self):
        """Health check for SelectPartitions on two well-populated partitions.

        50 privacy ids contribute to "pk1" and 50 other privacy ids contribute
        to "pk2"; the test expects both partition keys in the output.
        """
        # Arrange
        # Rows are (privacy_id, partition_key).
        rows = [(u, "pk1") for u in range(50)]
        rows.extend((50 + u, "pk2") for u in range(50))

        with TestPipeline() as pipeline:
            source = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta keep the noise small and therefore
            # reduce test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda row: row[0])
            private_rows = source | 'Create private collection' >> make_private

            params = aggregate_params.SelectPartitionsParams(
                max_partitions_contributed=2, budget_weight=0.9)
            extract_partition = lambda row: row[1]

            # Act
            select_partitions = private_beam.SelectPartitions(
                select_partitions_params=params,
                partition_extractor=extract_partition,
                label="Test select partitions")
            result = private_rows | select_partitions
            accountant.compute_budgets()

            # Assert
            # Sanity check only: both partition keys are expected to be kept.
            expected_partitions = ["pk1", "pk2"]
            beam_util.assert_that(result,
                                  beam_util.equal_to(expected_partitions))
Exemplo n.º 4
0
    def test_map_returns_correct_results_and_accountant(self):
        """Checks private_beam.Map maps the values and keeps the accountant.

        Expects the result to be a PrivatePCollection whose underlying `_pcol`
        holds (privacy_id, mapped_value) pairs and whose budget accountant is
        the one given to MakePrivate.
        """
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol_input = [(1, 2), (2, 3), (3, 4), (4, 5)]
            pcol = pipeline | 'Create produce' >> beam.Create(pcol_input)
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            # Act
            transformed = private_collection | private_beam.Map(
                fn=lambda x: x[1]**2)

            # Assert
            self.assertIsInstance(transformed, private_beam.PrivatePCollection)
            # Materialize the expectation as a list. A `map` object is a
            # one-shot iterator, so it would be empty if the matcher read it
            # more than once.
            expected = [(PrivateBeamTest.privacy_id_extractor(x), x[1]**2)
                        for x in pcol_input]
            beam_util.assert_that(transformed._pcol,
                                  beam_util.equal_to(expected))
            self.assertEqual(transformed._budget_accountant, budget_accountant)
Exemplo n.º 5
0
    def test_flatmap_returns_correct_results_and_accountant(self):
        """Checks private_beam.FlatMap expands values and keeps the accountant.

        Each input pair (a, b) is expanded into (a, b) and (a, b + 1); the
        expected `_pcol` contents pair every expanded value with a
        'pid:<original element>' key.
        """

        def flat_map_fn(x):
            # Emit two outputs per input: the second component offset by 0
            # and by 1.
            first, second = x[0], x[1]
            return [(first, second + offset) for offset in range(2)]

        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            input_pairs = [(1, 2), (2, 3), (3, 4)]
            source = pipeline | 'Create produce' >> beam.Create(input_pairs)
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_pairs = source | 'Create private collection' >> make_private

            # Act
            transformed = private_pairs | private_beam.FlatMap(flat_map_fn)

            # Assert
            self.assertIsInstance(transformed, private_beam.PrivatePCollection)
            expected = [
                ('pid:(1, 2)', (1, 2)),
                ('pid:(1, 2)', (1, 3)),
                ('pid:(2, 3)', (2, 3)),
                ('pid:(2, 3)', (2, 4)),
                ('pid:(3, 4)', (3, 4)),
                ('pid:(3, 4)', (3, 5)),
            ]
            beam_util.assert_that(transformed._pcol,
                                  beam_util.equal_to(expected))
            self.assertEqual(transformed._budget_accountant, accountant)
Exemplo n.º 6
0
    def test_privacy_id_count_returns_sensible_result(self):
        """Health check for PrivacyIdCount on a single partition.

        30 distinct privacy ids contribute to partition "pk1"; the test
        expects ("pk1", 30) within a tolerance of 5.
        """
        # Arrange
        # Rows are (privacy_id, partition_key).
        rows = [(u, "pk1") for u in range(30)]

        with TestPipeline() as pipeline:
            source = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta keep the noise small and therefore
            # reduce test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=800, total_delta=0.999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda row: row[0])
            private_rows = source | 'Create private collection' >> make_private

            count_params = aggregate_params.PrivacyIdCountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                budget_weight=1,
                partition_extractor=lambda row: row[1])

            # Act
            count_transform = private_beam.PrivacyIdCount(
                privacy_id_count_params=count_params)
            result = private_rows | count_transform
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is
            # deliberately generous to avoid flaky failures.
            matcher = beam_util.equal_to(
                [("pk1", 30)],
                equals_fn=lambda expected, actual: PrivateBeamTest.
                value_per_key_within_tolerance(expected, actual, 5))
            beam_util.assert_that(result, matcher)
Exemplo n.º 7
0
def main(unused_argv):
    """Computes a differentially private sum of money spent per day.

    Reads restaurant visits from the CSV file at FLAGS.input_file, sums
    `spent_money` per `day` with private_beam.Sum (Gaussian noise), and
    writes the result to FLAGS.output_file.

    Args:
        unused_argv: ignored.

    Returns:
        0 after the pipeline has been built and run.
    """
    # Setup Beam.
    # A local Beam runner is used here. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    local_runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=local_runner) as pipeline:

        # The privacy budget available for the whole computation.
        accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                       total_delta=1e-6)

        # Load the input data and normalize the column names.
        column_names = {
            'VisitorId': 'user_id',
            'Time entered': 'enter_time',
            'Time spent (minutes)': 'spent_minutes',
            'Money spent (euros)': 'spent_money',
            'Day': 'day'
        }
        visits_df = pd.read_csv(FLAGS.input_file)
        visits_df.rename(columns=column_names, inplace=True)
        # iterrows() yields (index, row) pairs; only the rows are needed.
        visit_rows = [row for _, row in visits_df.iterrows()]
        visits_pcol = pipeline | beam.Create(visit_rows)

        # Wrap Beam's PCollection into its private version.
        make_private = private_beam.MakePrivate(
            budget_accountant=accountant,
            privacy_id_extractor=lambda row: row.user_id)
        private_visits = visits_pcol | make_private

        # Calculate the private sum.
        sum_params = SumParams(noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                               max_partitions_contributed=7,
                               max_contributions_per_partition=2,
                               min_value=1,
                               max_value=100,
                               budget_weight=1,
                               public_partitions=None,
                               partition_extractor=lambda row: row.day,
                               value_extractor=lambda row: row.spent_money)
        dp_result = private_visits | private_beam.Sum(sum_params)
        accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
Exemplo n.º 8
0
    def test_sum_calls_aggregate_with_params(self, mock_aggregate):
        """Checks private_beam.Sum calls aggregate with matching params.

        Args:
            mock_aggregate: mock object supplied to this test (presumably by
              a patch decorator that is outside this view).
        """
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            values = (float(i) for i in range(1, 7))
            source = pipeline | 'Create produce' >> beam.Create(values)
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_values = (source |
                              'Create private collection' >> make_private)

            sum_params = aggregate_params.SumParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                max_contributions_per_partition=3,
                min_value=1,
                max_value=5,
                budget_weight=1,
                public_partitions=[],
                partition_extractor=lambda x: f"pk:{x // 10}",
                value_extractor=lambda x: x)

            # Act
            transformer = private_beam.Sum(sum_params=sum_params)
            private_values | transformer

            # Assert
            # The transform must have picked up the collection's accountant.
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_aggregate.assert_called_once()

            # The second positional argument of the mocked call must equal
            # the AggregateParams derived from sum_params.
            expected_params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.SUM],
                max_partitions_contributed=(
                    sum_params.max_partitions_contributed),
                max_contributions_per_partition=(
                    sum_params.max_contributions_per_partition),
                min_value=sum_params.min_value,
                max_value=sum_params.max_value,
                public_partitions=sum_params.public_partitions)
            positional_args = mock_aggregate.call_args[0]
            self.assertEqual(expected_params, positional_args[1])
Exemplo n.º 9
0
def main(unused_argv):
    """Computes a differentially private per-movie sum of ratings.

    Reads movie views from FLAGS.input_file, parses them with ParseFile,
    combines the ratings per movie with DPSumCombineFn, and writes the result
    to FLAGS.output_file.

    Args:
        unused_argv: ignored.

    Returns:
        0 after the pipeline has been built and run.
    """
    # Setup Beam.
    # A local Beam runner is used here. For a truly distributed calculation,
    # connect to a Beam cluster (e.g. running on some cloud provider).
    local_runner = fn_api_runner.FnApiRunner()
    with beam.Pipeline(runner=local_runner) as pipeline:

        # The privacy budget available for the whole computation.
        accountant = pipeline_dp.NaiveBudgetAccountant(total_epsilon=1,
                                                       total_delta=1e-6)

        # Load and parse the input data.
        movie_views = (pipeline | beam.io.ReadFromText(FLAGS.input_file) |
                       beam.ParDo(ParseFile()))

        # Wrap Beam's PCollection into its private version.
        make_private = pbeam.MakePrivate(
            budget_accountant=accountant,
            privacy_id_extractor=lambda mv: mv.user_id)
        private_views = movie_views | 'Create private collection' >> make_private

        # Key each view by movie so the ratings can be combined per movie.
        ratings_by_movie = private_views | pbeam.Map(
            lambda mv: (mv.movie_id, mv.rating))

        # Limits on how much one user can contribute:
        #  - at most two movies rated per user,
        #  - at most one rating for each movie.
        contribution_bounds = pbeam.CombinePerKeyParams(
            max_partitions_contributed=2, max_contributions_per_partition=1)

        # Calculate the private sum.
        dp_result = ratings_by_movie | pbeam.CombinePerKey(
            DPSumCombineFn(min_value=1, max_value=5), contribution_bounds)
        accountant.compute_budgets()

        # Save the results.
        dp_result | beam.io.WriteToText(FLAGS.output_file)

    return 0
Exemplo n.º 10
0
    def test_make_private_transform_succeeds(self):
        """Checks MakePrivate yields a PrivatePCollection with the accountant."""
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            source = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)

            # Act
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_values = (source |
                              'Create private collection' >> make_private)

            # Assert
            self.assertIsInstance(private_values,
                                  private_beam.PrivatePCollection)
            # The wrapped collection must carry the accountant it was given.
            self.assertEqual(private_values._budget_accountant, accountant)
Exemplo n.º 11
0
    def test_transform_with_return_anonymized_enabled_returns_pcollection(
            self):
        """Checks a transform with return_anonymized=True gives a PCollection.

        Applies SimplePrivatePTransform(return_anonymized=True) to a private
        collection and expects the result to be a `pvalue.PCollection`.
        """
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            source = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_values = (source |
                              'Create private collection' >> make_private)

            # Act
            anonymizing_transform = SimplePrivatePTransform(
                return_anonymized=True)
            transformed = private_values | anonymizing_transform

            # Assert
            self.assertIsInstance(transformed, pvalue.PCollection)
Exemplo n.º 12
0
    def test_variance_with_public_partitions_returns_sensible_result(self):
        """Health check for Variance restricted to public partitions.

        "pubK1" receives 30 values of -100.0 and 10 values of 100.0, "privK1"
        receives 30 values of 100.0, and "pubK2" receives no data. Only
        "pubK1" and "pubK2" are listed as public partitions; the test expects
        results for exactly those two keys, within a tolerance of 0.1.
        """
        # Arrange
        # Rows are (privacy_id, partition_key, value).
        rows = [(f"{u}", "pubK1", -100.0) for u in range(30)]
        rows.extend((f"{u + 30}", "pubK1", 100.0) for u in range(10))
        rows.extend((f"{u + 40}", "privK1", 100.0) for u in range(30))

        with TestPipeline() as pipeline:
            source = pipeline | 'Create produce' >> beam.Create(rows)
            # Very high epsilon and delta keep the noise small and therefore
            # reduce test flakiness.
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=8000, total_delta=0.9999999)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=lambda row: row[0])
            private_rows = source | 'Create private collection' >> make_private

            variance_params = aggregate_params.VarianceParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=1,
                max_contributions_per_partition=1,
                min_value=1.55,  # -100 should be clipped to this value
                max_value=2.7889,  # 100 should be clipped to this value
                budget_weight=1,
                partition_extractor=lambda row: row[1],
                value_extractor=lambda row: row[2])

            # Act
            variance_transform = private_beam.Variance(
                variance_params=variance_params,
                public_partitions=["pubK1", "pubK2"])
            result = private_rows | variance_transform
            accountant.compute_budgets()

            # Assert
            # Only a sanity check on the output, so the tolerance is
            # deliberately generous to avoid flaky failures.
            # pubK2 has no data points therefore the dataset is assumed to be
            # {min_value, max_value}.
            matcher = beam_util.equal_to(
                [("pubK1", 0.288), ("pubK2", 0.0)],
                equals_fn=lambda expected, actual: PrivateBeamTest.
                value_per_key_within_tolerance(expected, actual, 0.1))
            beam_util.assert_that(result, matcher)
Exemplo n.º 13
0
    def test_private_collection_with_non_private_transform_throws_error(self):
        """Checks a plain Beam transform on a private collection is rejected.

        Applying `beam.Map` to a PrivatePCollection must raise a TypeError
        whose message says that a PrivatePTransform is required.
        """
        runner = fn_api_runner.FnApiRunner()
        with beam.Pipeline(runner=runner) as pipeline:
            # Arrange
            pcol = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            budget_accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            private_collection = (
                pcol | 'Create private collection' >> private_beam.MakePrivate(
                    budget_accountant=budget_accountant,
                    privacy_id_extractor=PrivateBeamTest.privacy_id_extractor))

            # Act and Assert
            with self.assertRaises(TypeError) as context:
                (private_collection | 'Non private transform on '
                 'PrivatePCollection' >> beam.Map(lambda x: x))
            self.assertIsInstance(private_collection,
                                  private_beam.PrivatePCollection)
            # assertIn reports both the expected fragment and the actual
            # message on failure, unlike assertTrue(x in y).
            self.assertIn(
                "private_transform should be of type "
                "PrivatePTransform but is ", str(context.exception))
Exemplo n.º 14
0
    def test_privacy_id_count_calls_aggregate_with_params(
            self, mock_aggregate):
        """Checks PrivacyIdCount calls aggregate with matching params.

        Args:
            mock_aggregate: mock object supplied to this test (presumably by
              a patch decorator that is outside this view).
        """
        with beam.Pipeline(runner=fn_api_runner.FnApiRunner()) as pipeline:
            # Arrange
            source = pipeline | 'Create produce' >> beam.Create(
                [1, 2, 3, 4, 5, 6])
            accountant = budget_accounting.NaiveBudgetAccountant(
                total_epsilon=1, total_delta=0.01)
            make_private = private_beam.MakePrivate(
                budget_accountant=accountant,
                privacy_id_extractor=PrivateBeamTest.privacy_id_extractor)
            private_values = (source |
                              'Create private collection' >> make_private)

            count_params = aggregate_params.PrivacyIdCountParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                max_partitions_contributed=2,
                budget_weight=1,
                partition_extractor=lambda x: f"pk:{x // 10}")

            # Act
            transformer = private_beam.PrivacyIdCount(
                privacy_id_count_params=count_params)
            private_values | transformer

            # Assert
            # The transform must have picked up the collection's accountant.
            self.assertEqual(transformer._budget_accountant, accountant)
            mock_aggregate.assert_called_once()

            # The second positional argument of the mocked call must equal
            # the AggregateParams derived from count_params; the expected
            # max_contributions_per_partition is fixed at 1.
            expected_params = pipeline_dp.AggregateParams(
                noise_kind=pipeline_dp.NoiseKind.GAUSSIAN,
                metrics=[pipeline_dp.Metrics.PRIVACY_ID_COUNT],
                max_partitions_contributed=(
                    count_params.max_partitions_contributed),
                max_contributions_per_partition=1,
                public_partitions=count_params.public_partitions)
            positional_args = mock_aggregate.call_args[0]
            self.assertEqual(positional_args[1], expected_params)