Пример #1
0
def generate_synthetic_dataset(num_points: int = 1000000,
                               dim: int = 100,
                               num_clusters: int = 64,
                               cluster_ratio: float = 100.0,
                               radius: float = 1.0) -> clustering_params.Data:
    """Builds a labelled synthetic dataset made of Gaussian clusters.

    Cluster centers are drawn with sample_uniform_sphere using the reduced
    radius radius * (1 - 1 / cluster_ratio), leaving room for the points
    scattered around each center. Gaussian noise is then added around the
    centers: every cluster receives num_points // num_clusters points and the
    last cluster additionally receives the remainder. The result is finally
    clipped to the requested radius.

    Args:
      num_points: Total number of data points to generate.
      dim: Dimension of each data point.
      num_clusters: Number of clusters the points are split evenly into; any
        leftover points are assigned to the last cluster.
      cluster_ratio: Ratio of the intercluster distance to the intracluster
        distance.
      radius: Radius that the returned data is confined to; enforced at the
        end through Data.clip_by_radius().

    Returns:
      Data holding the clipped datapoints, the radius and the cluster label
      of every point.
    """
    ratio = float(cluster_ratio)

    # Keep the random draws in this order: centers first, then the noise.
    centers: np.ndarray = sample_uniform_sphere(
        num_clusters, dim, radius * (1 - 1 / ratio))  # (num_clusters, dim)
    noise_scale = np.sqrt(radius) / (ratio * np.sqrt(dim))
    datapoints: np.ndarray = np.random.normal(
        0, noise_scale, size=(num_points, dim))

    # Even split across clusters; the remainder lands in the last cluster.
    points_per_cluster, leftover = divmod(num_points, num_clusters)
    cluster_sizes: np.ndarray = np.full(
        num_clusters, points_per_cluster, dtype=int)
    cluster_sizes[-1] += leftover

    # Row i of the noise belongs to the cluster given by labels[i]; move it
    # onto that cluster's center.
    labels = np.repeat(np.arange(num_clusters), cluster_sizes)
    datapoints += np.repeat(centers, cluster_sizes, axis=0)

    # Enforce the radius by clipping any points that ended up outside of it.
    unclipped = clustering_params.Data(datapoints, radius, labels)
    return clustering_params.Data(unclipped.clip_by_radius(),
                                  unclipped.radius, unclipped.labels)
Пример #2
0
  def test_default_tree_param(self, points, returned_private_count, k, epsilon,
                              expected_min_num_points_in_branching_node,
                              expected_min_num_points_in_node,
                              expected_max_depth, mock_gaussian_noise,
                              mock_private_count):
    """Checks the tree parameters and private count from default_tree_param.

    Also verifies how the privacy budget is handed to the mocked Gaussian
    noise and private count helpers.
    """
    dim = 10
    delta = 1e-2
    frac_sum = 0.8
    frac_group_count = 0.2

    mock_private_count.return_value = returned_private_count
    all_ones = np.ones(shape=(points, dim))
    data = clustering_params.Data(all_ones, radius=1.0)
    privacy_param = clustering_params.DifferentialPrivacyParam(
        epsilon=epsilon, delta=delta)
    budget_split = clustering_params.PrivacyBudgetSplit(
        frac_sum=frac_sum, frac_group_count=frac_group_count)

    result = default_clustering_params.default_tree_param(
        k, data, privacy_param, budget_split)
    tree_param, private_count = result

    self.assertEqual(tree_param.max_depth, expected_max_depth)

    # With infinite epsilon no Gaussian noise computation is expected.
    if epsilon != np.inf:
      mock_gaussian_noise.assert_called_once_with(
          common.DifferentialPrivacyParameters(frac_sum * epsilon, delta), 1,
          1.0)
    else:
      mock_gaussian_noise.assert_not_called()

    expected_count_param = central_privacy_utils.CountPrivacyParam(
        epsilon=frac_group_count * epsilon / (tree_param.max_depth + 1),
        delta=delta)
    mock_private_count.assert_called_once_with(
        nonprivate_count=points, count_privacy_param=expected_count_param)

    self.assertEqual(private_count, returned_private_count)
    self.assertEqual(tree_param.min_num_points_in_node,
                     expected_min_num_points_in_node)
    self.assertEqual(tree_param.min_num_points_in_branching_node,
                     expected_min_num_points_in_branching_node)
Пример #3
0
def private_lsh_clustering(
        k: int,
        data: clustering_params.Data,
        privacy_param: clustering_params.DifferentialPrivacyParam,
        privacy_budget_split: typing.Optional[
            clustering_params.PrivacyBudgetSplit] = None,
        tree_param: typing.Optional[clustering_params.TreeParam] = None,
        short_description: str = "ClusteringParam") -> ClusteringResult:
    """Finds differentially private centers for k clusters of the data.

    A private coreset is built from the radius-clipped data, then weighted
    k-means++ is run on that coreset to obtain the centers.

    Args:
      k: Number of clusters to divide the data into.
      data: Data to find centers for. Centering the data around the origin
        beforehand may provide performance improvements.
      privacy_param: Differential privacy parameters.
      privacy_budget_split: Optional split of the privacy budget between the
        operations of the clustering algorithm, for fine-tuning. A default
        PrivacyBudgetSplit() is used when omitted.
      tree_param: Optional parameters of the LSH net tree, for fine-tuning.
        When omitted they are derived with
        default_clustering_params.default_tree_param.
      short_description: Optional description to identify this parameter
        configuration.

    Returns:
      ClusteringResult with differentially private centers. The rest of
      ClusteringResult is nonprivate, and only provided for convenience.
    """
    # Fill in whichever parameters the caller left out.
    budget_split = privacy_budget_split
    if budget_split is None:
        budget_split = clustering_params.PrivacyBudgetSplit()

    lsh_tree_param = tree_param
    root_private_count = None
    if lsh_tree_param is None:
        # The private count computed here is re-used for the root node of
        # the tree.
        defaults = default_clustering_params.default_tree_param(
            k, data, privacy_param, budget_split)
        lsh_tree_param, root_private_count = defaults

    clustering_param = clustering_params.ClusteringParam(
        privacy_param, budget_split, lsh_tree_param, short_description,
        data.radius)
    logging.debug("clustering_param: %s", clustering_param)

    # The provided radius has to be enforced to guarantee privacy.
    clipped_points = data.clip_by_radius()
    clipped_data = clustering_params.Data(clipped_points, data.radius,
                                          data.labels)

    coreset: private_outputs.PrivateWeightedData = get_private_coreset(
        clipped_data, clustering_param, root_private_count)

    # k-means cannot produce more centers than there are coreset points.
    num_centers = min(k, len(coreset.datapoints))
    logging.debug(
        "Starting k-means++ computation on private coreset with k=%d. This may "
        "be less than the original if generated coreset data ended up with "
        "less than k unique points.", num_centers)
    model = sklearn.cluster.KMeans(n_clusters=num_centers, init="k-means++")
    fitted = model.fit(coreset.datapoints, sample_weight=coreset.weights)

    # The result is computed relative to the original (unclipped) data.
    # Note: everything besides the centers is nonprivate.
    return ClusteringResult(data, fitted.cluster_centers_)
Пример #4
0
 def test_clustering_result_value_errors_unequal_points(self):
     """ClusteringResult rejects a label array shorter than the datapoints."""
     three_points = np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]])
     two_labels = np.array([0, 1], dtype=int)
     two_centers = np.array([[0, 0, 0], [1, 1, 1]])
     data = clustering_params.Data(datapoints=three_points, radius=200)

     # Three datapoints but only two labels.
     with self.assertRaises(ValueError):
         clustering_algorithm.ClusteringResult(
             data, two_centers, two_labels, loss=1.0)
Пример #5
0
 def test_clustering_result_value_errors_loss_label_only_one_init(self):
     """Passing only labels or only loss to ClusteringResult is an error."""
     data = clustering_params.Data(datapoints=np.zeros((4, 3)), radius=2)
     centers = np.zeros((2, 3))
     cluster_labels = np.array([0, 0, 1, 1], dtype=int)

     # Labels given without a loss.
     with self.assertRaises(ValueError):
         clustering_algorithm.ClusteringResult(data, centers, cluster_labels)

     # Loss given without labels.
     with self.assertRaises(ValueError):
         clustering_algorithm.ClusteringResult(data, centers, loss=1.0)
Пример #6
0
 def test_value_error_no_true_labels(self):
     """Label-based metrics raise when the data carries no true labels."""
     radius = 1.0
     datapoints = np.zeros(shape=(6, 4))
     # No labels are passed to Data here.
     unlabeled_data = clustering_params.Data(datapoints, radius)
     result = clustering_algorithm.ClusteringResult(
         unlabeled_data,
         np.zeros(shape=(3, 4)),
         np.array([0, 0, 1, 1, 2, 2]),
         loss=1.0)

     with self.assertRaises(ValueError):
         result.cross_label_histogram()
     with self.assertRaises(ValueError):
         result.get_clustering_metrics()
Пример #7
0
 def test_root_node_provide_private_count(self):
     """root_node keeps a caller-supplied private count and the inputs."""
     radius = 4.3
     max_depth = 20
     points = [[1, 2, 1], [0.4, 0.2, 0.8], [3, 0, 3]]
     param = test_utils.get_test_clustering_param(radius=radius,
                                                  max_depth=max_depth)

     root = lsh_tree.root_node(
         clustering_params.Data(points, radius=radius),
         param,
         private_count=10)

     self.assertEqual(root.hash_prefix, '')
     self.assertSequenceEqual(root.nonprivate_points, points)
     self.assertEqual(root.clustering_param, param)
     # The points above are 3-dimensional.
     self.assertEqual(root.sim_hash.dim, 3)
     self.assertEqual(root.sim_hash.max_hash_len, max_depth)
     self.assertEqual(root.private_count, 10)
Пример #8
0
 def test_clustering_result_value_errors_labels_out_of_bounds(self):
     """ClusteringResult rejects labels that are not valid center indices."""
     centers = np.array([[0, 0, 0], [1, 1, 1]])
     data = clustering_params.Data(
         datapoints=np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]]),
         radius=200)

     # With two centers the only valid labels are the integers 0 and 1.
     invalid_label_sets = (
         np.array([-1, 0, 1], dtype=int),  # negative label
         np.array([0, 1, 2], dtype=int),  # label past the last center
         np.array([0, 1, 1.1]),  # non-integer label
     )
     for bad_labels in invalid_label_sets:
         with self.assertRaises(ValueError):
             clustering_algorithm.ClusteringResult(
                 data, centers, bad_labels, loss=1.0)
Пример #9
0
 def test_clip_by_radius_default_to_self(self):
   """clip_by_radius() without arguments clips to the Data's own radius."""
   data = clustering_params.Data(
       np.array([
           [0., 0., 0., 0.],
           [1., 2., 3., 4.],
           [5., 6., 7., 8.],
           [9., 10., 11., 12.],
           [13., 14., 15., 16.],
       ]),
       radius=10.0)

   clipped = data.clip_by_radius()

   # The first two rows are expected unchanged; the rest are scaled down.
   expected_rows = [
       [0., 0., 0., 0.],
       [1., 2., 3., 4.],
       [3.79049022, 4.54858826, 5.30668631, 6.06478435],
       [4.26162351, 4.73513724, 5.20865096, 5.68216469],
       [4.46949207, 4.81329915, 5.15710623, 5.50091331],
   ]
   self.assertLen(clipped, 5)
   for row_index, expected_row in enumerate(expected_rows):
     self.assertSequenceAlmostEqual(clipped[row_index], expected_row)
Пример #10
0
    def test_get_clustering_result(self):
        """ClusteringResult derives labels and loss when given only centers."""
        datapoints = np.array([[1, 0, 1], [101, 101, 99], [4, 0, 4]])
        centers = np.array([[0, 0, 0], [100, 100, 100]])
        data = clustering_params.Data(datapoints=datapoints, radius=200)

        result = clustering_algorithm.ClusteringResult(data, centers)

        # Data and centers are carried through unchanged.
        self.assertLen(data.datapoints, 3)
        for row, stored_point in enumerate(result.data.datapoints):
            self.assertSequenceAlmostEqual(datapoints[row], stored_point)
        self.assertLen(centers, 2)
        for row, stored_center in enumerate(result.centers):
            self.assertSequenceAlmostEqual(centers[row], stored_center)

        # Squared distances to the assigned centers: 2 + 3 + 32 = 37.
        self.assertListEqual(list(result.labels), [0, 1, 0])
        self.assertAlmostEqual(result.loss, 37)
Пример #11
0
    def test_clipped_data_used_for_clustering_and_not_result_calculation(self):
        """Centers come from clipped data; labels and loss from the original."""
        # With radius=1 the clipped datapoints are
        # [[0.3, 0.2], [0.6, 0.8], [0.6, 0.8]], so the small radius forces
        # clipping for the center calculation.
        original_points = np.array([[0.3, 0.2], [3, 4], [6, 8]])
        data = clustering_params.Data(datapoints=original_points, radius=1)
        # Infinite epsilon: no noise.
        no_noise_param = clustering_params.DifferentialPrivacyParam(np.inf)
        # No branching, so the coreset is just the average of the points.
        no_branching_tree = clustering_params.TreeParam(1, 1, 0)

        result = clustering_algorithm.private_lsh_clustering(
            3, data, no_noise_param, tree_param=no_branching_tree)

        # The single center is the mean of the clipped points.
        self.assertLen(result.centers, 1)
        self.assertSequenceAlmostEqual(result.centers[0],
                                       np.array([0.5, 0.6]))

        self.assertListEqual(list(result.labels), [0, 0, 0])

        # The loss is still measured against the original points.
        self.assertAlmostEqual(result.loss, 103.02)
Пример #12
0
    def test_get_clustering_metrics(self):
        """get_clustering_metrics reports the expected counts and fractions."""
        true_labels = np.array([0, 0, 0, 1, 1, 1])
        assigned_labels = np.array([0, 0, 1, 1, 2, 2])
        data = clustering_params.Data(np.zeros(shape=(6, 4)), 1.0,
                                      true_labels)
        result = clustering_algorithm.ClusteringResult(
            data, np.zeros(shape=(3, 4)), assigned_labels, loss=1.0)

        metrics = result.get_clustering_metrics()

        # Rows are the assigned clusters, columns the true labels.
        expected_histogram = np.array([[2, 0], [1, 1], [0, 2]], dtype=int)
        histogram_matches = (
            metrics.cross_label_histogram == expected_histogram)
        self.assertTrue(histogram_matches.all())

        self.assertEqual(metrics.num_points, 6)
        self.assertEqual(metrics.dominant_label_correct_count, 5)
        self.assertAlmostEqual(metrics.dominant_label_accuracy, 5 / 6)

        self.assertEqual(metrics.true_pairs, 6)
        self.assertEqual(metrics.true_nonmatch_count, 4)
        self.assertAlmostEqual(metrics.true_nonmatch_frac, 4 / 6)

        self.assertEqual(metrics.false_pairs, 9)
        self.assertEqual(metrics.false_match_count, 1)
        self.assertAlmostEqual(metrics.false_match_frac, 1 / 9)
Пример #13
0
 def test_small_dataset(self):
     """Clustering a single-point dataset still returns a result."""
     single_point = np.array([[0.3, 0.2]])
     data = clustering_params.Data(datapoints=single_point, radius=1)
     result = clustering_algorithm.private_lsh_clustering(
         self.baseline_k, data, self.baseline_privacy_param)
     self.assertIsNotNone(result)
Пример #14
0
 def test_data_label_unequal_length(self):
   """Data rejects a label array whose length differs from the datapoints."""
   num_points = 10
   dim = 3
   # One label fewer than there are datapoints.
   too_few_labels = np.ones(num_points - 1, dtype=int)
   with self.assertRaises(ValueError):
     clustering_params.Data(
         np.zeros(shape=(num_points, dim)), radius=1.0, labels=too_few_labels)
Пример #15
0
 def test_data(self):
   """Data exposes num_points, dim and radius matching its inputs."""
   num_points = 10
   num_dims = 3
   all_ones = np.ones(shape=(num_points, num_dims))
   data = clustering_params.Data(all_ones, radius=1.0)
   self.assertEqual(data.num_points, num_points)
   self.assertEqual(data.dim, num_dims)
   self.assertEqual(data.radius, 1.0)