def get_test_clustering_param(epsilon=1.0,
                              delta=1e-2,
                              frac_sum=0.2,
                              frac_group_count=0.8,
                              min_num_points_in_branching_node=4,
                              min_num_points_in_node=2,
                              max_depth=4,
                              radius=1):
    # pylint: disable=g-doc-args
    """Builds a ClusteringParam for tests, defaulting every field.

    Usage: a test should explicitly pass the parameters it relies on and leave
    the remaining ones at their defaults.

    Args:
      epsilon: Forwarded to DifferentialPrivacyParam(epsilon=...).
      delta: Forwarded to DifferentialPrivacyParam(delta=...).
      frac_sum: Forwarded to PrivacyBudgetSplit(frac_sum=...).
      frac_group_count: Forwarded to PrivacyBudgetSplit(frac_group_count=...).
      min_num_points_in_branching_node: Forwarded to TreeParam.
      min_num_points_in_node: Forwarded to TreeParam.
      max_depth: Forwarded to TreeParam(max_depth=...).
      radius: Forwarded to ClusteringParam(radius=...).

    Returns:
      A clustering_params.ClusteringParam whose short_description is
      'TestClusteringParam'.
    """
    # Keyword arguments are evaluated left to right, so the three component
    # objects are still constructed in the order privacy -> budget -> tree.
    return clustering_params.ClusteringParam(
        privacy_param=clustering_params.DifferentialPrivacyParam(
            epsilon=epsilon, delta=delta),
        privacy_budget_split=clustering_params.PrivacyBudgetSplit(
            frac_sum=frac_sum, frac_group_count=frac_group_count),
        tree_param=clustering_params.TreeParam(
            min_num_points_in_branching_node=min_num_points_in_branching_node,
            min_num_points_in_node=min_num_points_in_node,
            max_depth=max_depth),
        short_description='TestClusteringParam',
        radius=radius)
def test_default_tree_param(self, points, returned_private_count, k, epsilon,
                            expected_min_num_points_in_branching_node,
                            expected_min_num_points_in_node,
                            expected_max_depth, mock_gaussian_noise,
                            mock_private_count):
    """Checks the TreeParam and private count from default_tree_param.

    Args:
      points: Number of rows in the all-ones dataset handed to the function.
      returned_private_count: Value the mocked private count returns.
      k: Number of clusters passed to default_tree_param.
      epsilon: Privacy epsilon; np.inf selects the "no noise" expectation.
      expected_min_num_points_in_branching_node: Expected TreeParam field.
      expected_min_num_points_in_node: Expected TreeParam field.
      expected_max_depth: Expected TreeParam.max_depth.
      mock_gaussian_noise: Mock supplied by the caller (presumably a patch
        decorator outside this block) standing in for the noise object.
      mock_private_count: Mock supplied by the caller standing in for the
        private count helper.
    """
    num_dims = 10
    delta = 1e-2
    sum_fraction = 0.8
    count_fraction = 0.2

    mock_private_count.return_value = returned_private_count
    dataset = clustering_params.Data(
        np.ones(shape=(points, num_dims)), radius=1.0)
    dp_param = clustering_params.DifferentialPrivacyParam(
        epsilon=epsilon, delta=delta)
    split = clustering_params.PrivacyBudgetSplit(
        frac_sum=sum_fraction, frac_group_count=count_fraction)

    tree_param, private_count = default_clustering_params.default_tree_param(
        k, dataset, dp_param, split)

    self.assertEqual(tree_param.max_depth, expected_max_depth)

    if epsilon != np.inf:
        # Finite epsilon: the noise object is expected to be built exactly
        # once, from the sum share of the budget.
        mock_gaussian_noise.assert_called_once_with(
            common.DifferentialPrivacyParameters(sum_fraction * epsilon, delta),
            1, 1.0)
    else:
        mock_gaussian_noise.assert_not_called()

    # The count share of epsilon is expected to be divided by max_depth + 1.
    expected_count_param = central_privacy_utils.CountPrivacyParam(
        epsilon=count_fraction * epsilon / (tree_param.max_depth + 1),
        delta=delta)
    mock_private_count.assert_called_once_with(
        nonprivate_count=points, count_privacy_param=expected_count_param)

    self.assertEqual(private_count, returned_private_count)
    self.assertEqual(tree_param.min_num_points_in_node,
                     expected_min_num_points_in_node)
    self.assertEqual(tree_param.min_num_points_in_branching_node,
                     expected_min_num_points_in_branching_node)
def private_lsh_clustering(
        k: int,
        data: clustering_params.Data,
        privacy_param: clustering_params.DifferentialPrivacyParam,
        privacy_budget_split: typing.Optional[
            clustering_params.PrivacyBudgetSplit] = None,
        tree_param: typing.Optional[clustering_params.TreeParam] = None,
        short_description: str = "ClusteringParam") -> ClusteringResult:
    """Clusters data into k clusters.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for. Centering the data around the origin
        beforehand may provide performance improvements.
      privacy_param: Differential privacy parameters.
      privacy_budget_split: Optional privacy budget split between operations in
        the clustering algorithm for fine-tuning. When None, a default
        PrivacyBudgetSplit() is used.
      tree_param: Optional tree parameters for generating the LSH net tree for
        fine-tuning. When None, they are derived by
        default_clustering_params.default_tree_param.
      short_description: Optional description to identify this parameter
        configuration.

    Returns:
      ClusteringResult with differentially private centers. The rest of
      ClusteringResult is nonprivate, and only provided for convenience.
    """
    # Initialize the parameters.
    if privacy_budget_split is None:
        privacy_budget_split = clustering_params.PrivacyBudgetSplit()

    private_count = None
    if tree_param is None:
        # Saves the private count to re-use for the root node of the tree.
        tree_param, private_count = default_clustering_params.default_tree_param(
            k, data, privacy_param, privacy_budget_split)

    clustering_param = clustering_params.ClusteringParam(
        privacy_param, privacy_budget_split, tree_param, short_description,
        data.radius)
    logging.debug("clustering_param: %s", clustering_param)

    # To guarantee privacy, enforce the radius provided.
    clipped_data = clustering_params.Data(data.clip_by_radius(), data.radius,
                                          data.labels)

    coreset: private_outputs.PrivateWeightedData = get_private_coreset(
        clipped_data, clustering_param, private_count)

    # k-means is run with at most as many clusters as the coreset has
    # datapoints.
    k = min(k, len(coreset.datapoints))
    # The message is built from adjacent single-line literals; a double-quoted
    # literal may not span physical lines.
    logging.debug(
        "Starting k-means++ computation on private coreset with k=%d. This may "
        "be less than the original if generated coreset data ended up with "
        "less than k unique points.", k)
    kmeans = sklearn.cluster.KMeans(n_clusters=k, init="k-means++").fit(
        coreset.datapoints, sample_weight=coreset.weights)

    # Calculate the result relative to the original data.
    # Note: the calculations besides the centers are nonprivate.
    return ClusteringResult(data, kmeans.cluster_centers_)
def test_private_count_param(self):
    """Checks the CountPrivacyParam derived for group counts.

    With epsilon=10, frac_group_count=0.8 and a max tree depth of 3 the test
    expects epsilon 2.0 and an unchanged delta of 1e-2.
    NOTE(review): 2.0 equals 10 * 0.8 / (3 + 1); presumably the count budget
    is divided across depth + 1 levels -- confirm against
    compute_group_count_privacy_param.
    """
    dp_param = clustering_params.DifferentialPrivacyParam(
        epsilon=10, delta=1e-2)
    budget_split = clustering_params.PrivacyBudgetSplit(
        frac_sum=0.2, frac_group_count=0.8)
    depth = 3

    result = CountPrivacyParam.compute_group_count_privacy_param(
        dp_param, budget_split, depth)

    self.assertEqual(result.epsilon, 2.0)
    self.assertEqual(result.delta, 1e-2)
def test_private_count_param(self):
    """Checks that CentralPrivateCountParam exposes its constructor inputs.

    The three positional arguments must be readable back as the
    privacy_param, privacy_budget_split and max_tree_depth attributes.
    """
    dp_param = clustering_params.DifferentialPrivacyParam(
        epsilon=2.0, delta=1e-3)
    budget_split = clustering_params.PrivacyBudgetSplit(
        frac_sum=0.7, frac_group_count=0.3)
    depth = 10

    count_param = CentralPrivateCountParam(dp_param, budget_split, depth)

    self.assertEqual(count_param.privacy_param, dp_param)
    self.assertEqual(count_param.privacy_budget_split, budget_split)
    self.assertEqual(count_param.max_tree_depth, depth)
def test_clustering_param(self):
    """Checks that ClusteringParam stores each keyword argument as given."""
    dp_param = clustering_params.DifferentialPrivacyParam()
    budget_split = clustering_params.PrivacyBudgetSplit()
    tree = clustering_params.TreeParam(
        min_num_points_in_branching_node=4,
        min_num_points_in_node=2,
        max_depth=5)

    param = clustering_params.ClusteringParam(
        privacy_param=dp_param,
        privacy_budget_split=budget_split,
        tree_param=tree,
        short_description="TestClusteringParam",
        radius=20)

    # Every field should read back exactly what was passed in.
    self.assertEqual(param.privacy_param, dp_param)
    self.assertEqual(param.privacy_budget_split, budget_split)
    self.assertEqual(param.tree_param, tree)
    self.assertEqual(param.short_description, "TestClusteringParam")
    self.assertEqual(param.radius, 20)
def test_privacy_budget_split_invalid(self):
    """Checks that a budget split totalling more than 1.0 is rejected.

    frac_sum=0.7 plus frac_group_count=0.8 totals 1.5, so constructing the
    PrivacyBudgetSplit is expected to raise ValueError.
    """
    # `msg` is only the text reported when no ValueError is raised; unittest
    # does not compare it with the exception's own message. It now quotes the
    # actual total of the arguments below (1.5).
    # NOTE(review): to verify the exception text itself, switch to
    # assertRaisesRegex once the exact message has been confirmed.
    with self.assertRaises(
            ValueError,
            msg="The provided privacy budget split (1.5) was greater than 1.0."):
        clustering_params.PrivacyBudgetSplit(frac_sum=0.7, frac_group_count=0.8)
def test_privacy_budget_split_defaults(self):
    """Checks the fractions of a PrivacyBudgetSplit built with no arguments.

    The expected defaults are frac_sum == 0.8 and frac_group_count == 0.2.
    """
    default_split = clustering_params.PrivacyBudgetSplit()

    self.assertEqual(default_split.frac_sum, 0.8)
    self.assertEqual(default_split.frac_group_count, 0.2)