def compute_on_local():
    """Compute DP rating metrics with local pipeline operations.

    Parses the movie views from ``FLAGS.input_file``, runs
    ``calc_dp_rating_metrics`` over them together with the public partitions
    and a ``pipeline_dp.LocalPipelineOperations`` instance, and writes the
    materialized result to ``FLAGS.output_file``.
    """
    partitions = get_public_partitions()
    views = parse_file(FLAGS.input_file)
    local_ops = pipeline_dp.LocalPipelineOperations()

    metrics = calc_dp_rating_metrics(views, local_ops, partitions)
    # The result is turned into a concrete list before it reaches the writer.
    write_to_file(list(metrics), FLAGS.output_file)
def test_contribution_bounding_empty_col(self):
    """Bounding the contributions of an empty collection yields no results."""
    engine = pipeline_dp.DPEngine(
        NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
        pipeline_dp.LocalPipelineOperations())

    bounded = engine._bound_contributions(
        [],
        max_partitions_contributed=2,
        max_contributions_per_partition=2,
        aggregator_fn=dp_engineTest.aggregator_fn)

    # Materialize first so that emptiness is checked on a concrete list.
    self.assertFalse(list(bounded))
def test_select_private_partitions(self):
    """Exercise private partition selection through the mock-based helper.

    The input is grouped by key, every group is wrapped in a
    ``_MockAccumulator``, and ``_mock_and_assert_private_partitions`` is
    called three times with a third argument of 0, 3 and 100. The expected
    filtered data is, respectively, both groups, only the "pid1" group, and
    nothing.
    NOTE(review): the third argument is presumably a selection threshold
    used by the helper's mock -- confirm against the helper.
    """
    rows = [
        ("pid1", ('pk1', 1)),
        ("pid1", ('pk1', 2)),
        ("pid1", ('pk2', 3)),
        ("pid1", ('pk2', 4)),
        ("pid1", ('pk2', 5)),
        ("pid1", ('pk3', 6)),
        ("pid1", ('pk4', 7)),
        ("pid2", ('pk4', 8)),
    ]
    partition_cap = 3
    engine = pipeline_dp.DPEngine(
        NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
        pipeline_dp.LocalPipelineOperations())

    grouped = engine._ops.group_by_key(rows, None)
    grouped = engine._ops.map_values(
        grouped, lambda values: _MockAccumulator(values))
    grouped = list(grouped)

    def pid1_group():
        # A fresh tuple/accumulator per call, so expectations never share
        # state between the individual checks.
        return ("pid1",
                _MockAccumulator([
                    ('pk1', 1),
                    ('pk1', 2),
                    ('pk2', 3),
                    ('pk2', 4),
                    ('pk2', 5),
                    ('pk3', 6),
                    ('pk4', 7),
                ]))

    # Third argument 0: both groups are expected.
    expected = [pid1_group(), ("pid2", _MockAccumulator([('pk4', 8)]))]
    self._mock_and_assert_private_partitions(engine, grouped, 0, expected,
                                             partition_cap)

    # Third argument 3: only the "pid1" group is expected.
    expected = [pid1_group()]
    self._mock_and_assert_private_partitions(engine, grouped, 3, expected,
                                             partition_cap)

    # Third argument 100: nothing is expected.
    expected = []
    self._mock_and_assert_private_partitions(engine, grouped, 100, expected,
                                             partition_cap)
def test_contribution_bounding_bound_input_nothing_dropped(self):
    """Input that already respects both limits is aggregated in full.

    "pid1" contributes twice to each of two partitions while both limits
    are 2, so one aggregated entry per (privacy id, partition) pair is
    expected.
    NOTE(review): the value triples presumably hold (count, sum, sum of
    squares) as produced by ``dp_engineTest.aggregator_fn`` -- confirm.
    """
    rows = [
        ("pid1", 'pk1', 1),
        ("pid1", 'pk1', 2),
        ("pid1", 'pk2', 3),
        ("pid1", 'pk2', 4),
    ]
    engine = pipeline_dp.DPEngine(
        NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
        pipeline_dp.LocalPipelineOperations())

    bounded = list(
        engine._bound_contributions(
            rows,
            max_partitions_contributed=2,
            max_contributions_per_partition=2,
            aggregator_fn=dp_engineTest.aggregator_fn))

    # Compared as sets: the order of the results is not part of the check.
    expected = {
        (('pid1', 'pk2'), (2, 7, 25)),
        (('pid1', 'pk1'), (2, 3, 5)),
    }
    self.assertEqual(expected, set(bounded))
def test_aggregate_computation_graph_verification(self,
                                                  mock_bound_contributions):
    """Verify that ``aggregate`` forwards the bounding limits.

    Args:
        mock_bound_contributions: mock standing in for the contribution
            bounding step (presumably injected by a patch decorator that is
            not visible here -- confirm); its last call is inspected.
    """
    # Arrange
    params = pipeline_dp.AggregateParams([agg.Metrics.COUNT], 5, 3)
    accountant = NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10)
    factory = AccumulatorFactory(params=params, budget_accountant=accountant)
    factory.initialize()

    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=lambda row: "pid" + str(row),
        partition_extractor=lambda row: "pk" + str(row),
        value_extractor=lambda row: row)

    # Canned output for the mocked bounding step.
    canned_output = [
        [("pid1", "pk1"), CountAccumulator(params=None, values=[1])],
        [("pid2", "pk2"), CountAccumulator(params=None, values=[1])],
        [("pid3", "pk3"), CountAccumulator(params=None, values=[2])],
    ]
    mock_bound_contributions.return_value = canned_output

    engine = pipeline_dp.DPEngine(
        budget_accountant=accountant,
        ops=pipeline_dp.LocalPipelineOperations())

    # Act
    rows = engine.aggregate(col=rows,
                            params=params,
                            data_extractors=extractors)

    # Assert: only the two limits are pinned, the other arguments may be
    # anything.
    any_arg = unittest.mock.ANY
    mock_bound_contributions.assert_called_with(
        any_arg, params.max_partitions_contributed,
        params.max_contributions_per_partition, any_arg)
def test_aggregate_report(self, mock_create_accumulator_params_function):
    """Each ``aggregate`` call registers one report generator on the engine.

    Args:
        mock_create_accumulator_params_function: mock whose return value is
            set to a single ``CountAccumulator`` parameter entry (presumably
            injected by a patch decorator that is not visible here --
            confirm).
    """

    def privacy_id_of(row):
        return "pid" + str(row)

    def partition_of(row):
        return "pk" + str(row)

    def value_of(row):
        return row

    rows = [[1], [2], [3], [3]]
    extractors = pipeline_dp.DataExtractors(
        privacy_id_extractor=privacy_id_of,
        partition_extractor=partition_of,
        value_extractor=value_of)

    first_params = pipeline_dp.AggregateParams(
        metrics=[
            pipeline_dp.Metrics.PRIVACY_ID_COUNT,
            pipeline_dp.Metrics.COUNT,
            pipeline_dp.Metrics.MEAN,
        ],
        max_partitions_contributed=3,
        max_contributions_per_partition=2,
        low=1,
        high=5,
    )
    second_params = pipeline_dp.AggregateParams(
        metrics=[
            pipeline_dp.Metrics.VAR,
            pipeline_dp.Metrics.SUM,
            pipeline_dp.Metrics.MEAN,
        ],
        max_partitions_contributed=1,
        max_contributions_per_partition=3,
        low=2,
        high=10,
        public_partitions=list(range(1, 40)),
    )

    count_params = pipeline_dp.accumulator.AccumulatorParams(
        pipeline_dp.accumulator.CountAccumulator, None)
    mock_create_accumulator_params_function.return_value = [count_params]

    engine = pipeline_dp.DPEngine(
        budget_accountant=NaiveBudgetAccountant(total_epsilon=1,
                                                total_delta=1e-10),
        ops=pipeline_dp.LocalPipelineOperations())

    # Two aggregations on the same engine, in the same order as declared.
    for params in (first_params, second_params):
        engine.aggregate(rows, params, extractors)

    generators = engine._report_generators  # pylint: disable=protected-access
    self.assertEqual(len(generators), 2)
def test_contribution_bounding_cross_partition_bounding_applied(self):
    """Cross-partition bounding caps the partitions kept per privacy id.

    "pid1" contributes to four partitions while at most three are allowed,
    so four results are expected in total across the two privacy ids, with
    no privacy id exceeding the partition limit.
    """
    rows = [
        ("pid1", 'pk1', 1),
        ("pid1", 'pk1', 2),
        ("pid1", 'pk2', 3),
        ("pid1", 'pk2', 4),
        ("pid1", 'pk2', 5),
        ("pid1", 'pk3', 6),
        ("pid1", 'pk4', 7),
        ("pid2", 'pk4', 8),
    ]
    partition_cap = 3
    per_partition_cap = 5
    engine = pipeline_dp.DPEngine(
        NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
        pipeline_dp.LocalPipelineOperations())

    bounded = list(
        engine._bound_contributions(
            rows,
            max_partitions_contributed=partition_cap,
            max_contributions_per_partition=per_partition_cap,
            aggregator_fn=dp_engineTest.aggregator_fn))

    self.assertEqual(len(bounded), 4)

    # Per-partition check: the first component of every aggregated value
    # (presumably the contribution count -- confirm against aggregator_fn)
    # must stay within the per-partition limit.
    self.assertTrue(
        all(entry[1][0] <= per_partition_cap for entry in bounded))

    # Cross-partition check: collect the partitions kept for each privacy id.
    partitions_by_pid = collections.defaultdict(list)
    for pid_and_pk, _ in bounded:
        partitions_by_pid[pid_and_pk[0]].append(pid_and_pk[1])

    self.assertEqual(len(partitions_by_pid), 2)
    self.assertTrue(
        all(
            len(partitions) <= partition_cap
            for partitions in partitions_by_pid.values()))
def test_contribution_bounding_per_partition_bounding_applied(self):
    """Per-partition bounding caps contributions inside each partition.

    "pid1" contributes three times to 'pk2' while at most two contributions
    per partition are allowed; three (privacy id, partition) results are
    expected, each within the per-partition limit.
    """
    rows = [
        ("pid1", 'pk1', 1),
        ("pid1", 'pk1', 2),
        ("pid1", 'pk2', 3),
        ("pid1", 'pk2', 4),
        ("pid1", 'pk2', 5),
        ("pid2", 'pk2', 6),
    ]
    partition_cap = 5
    per_partition_cap = 2
    engine = pipeline_dp.DPEngine(
        NaiveBudgetAccountant(total_epsilon=1, total_delta=1e-10),
        pipeline_dp.LocalPipelineOperations())

    bounded = list(
        engine._bound_contributions(
            rows,
            max_partitions_contributed=partition_cap,
            max_contributions_per_partition=per_partition_cap,
            aggregator_fn=dp_engineTest.aggregator_fn))

    self.assertEqual(len(bounded), 3)

    # The first component of every aggregated value (presumably the
    # contribution count -- confirm against aggregator_fn) must stay within
    # the per-partition limit.
    self.assertTrue(
        all(entry[1][0] <= per_partition_cap for entry in bounded))