def test_samples_high_weight_elements_priority(self):
        """Verifies that priority sampling keeps every sufficiently heavy element.

        For threshold t, an element whose weight is at least 1/t is always
        sampled, so this test is expected to succeed on every run. The
        threshold used here is 0.5, and both weights (2.0 and 3.0) are at
        least 1/0.5 = 2.0.
        """
        method = private_sampling.PrioritySamplingMethod
        sample = private_sampling.ThresholdSample(0.5, method)
        # Process the elements in the same order as before: "a" then "b".
        for key, weight in (("a", 2.0), ("b", 3.0)):
            sample.process(key, weight)
        self.assertCountEqual(["a", "b"], sample.elements.keys())
    def test_does_not_sample_negligible_weight_priority(self):
        """Verifies that priority sampling skips an element of negligible weight.

        With the threshold fixed at 1.0, an element of weight w is sampled
        with probability min{w, 1}. The single element processed here has
        weight 1/FAILURE_PROBABILITY_INVERSE, so the test fails (the element
        ends up in the sample) only with that same probability.
        """
        negligible_weight = 1.0 / FAILURE_PROBABILITY_INVERSE
        sample = private_sampling.ThresholdSample(
            1.0, private_sampling.PrioritySamplingMethod)
        sample.process("a", negligible_weight)
        self.assertEmpty(sample.elements)
    def test_samples_high_weight_elements_ppswor(self):
        """Verifies that PPSWOR sampling keeps an element of high weight.

        With the threshold fixed at 1.0, an element of weight w is sampled
        with probability 1 - exp(-w). The element used here has weight
        ln(FAILURE_PROBABILITY_INVERSE), so it is missed (and the test fails)
        with probability 1/FAILURE_PROBABILITY_INVERSE.
        """
        heavy_weight = math.log(FAILURE_PROBABILITY_INVERSE, math.e)
        sample = private_sampling.ThresholdSample(
            1.0, private_sampling.PpsworSamplingMethod)
        sample.process("a", heavy_weight)
        self.assertCountEqual(["a"], sample.elements.keys())
    def test_does_not_sample_twice_ppswor(self):
        """Checks that an exception is raised when processing the same key twice.

        The exception is raised when we process a key that is already in the
        sample (this event should not happen since we assume the data is
        aggregated). To trigger it, we start with an element with high weight
        (which is thus sampled with high probability), and then try to add it
        again.

        As in test_samples_high_weight_elements_ppswor, the test fails with
        probability 1/FAILURE_PROBABILITY_INVERSE (which happens when the
        first element is not sampled).

        Raises:
          ValueError: Propagated (failing the test) if constructing the sample
            or processing the first element raises; only the repeated key is
            expected to raise.
        """
        s = private_sampling.ThresholdSample(
            1.0, private_sampling.PpsworSamplingMethod)
        s.process("a", math.log(FAILURE_PROBABILITY_INVERSE, math.e))
        # Keep the setup outside the assertRaises context: a ValueError raised
        # by the constructor or by the first insertion must not satisfy the
        # assertion, otherwise the test would pass for the wrong reason.
        with self.assertRaises(ValueError):
            s.process("a", 1)
    def test_estimate_full_statistics_priority(self):
        """Verifies the full-statistics estimate under priority sampling.

        The estimator is run on a dataset in which every element is sampled
        with probability 1.0, so the estimate should match the true statistic
        exactly.

        As in test_samples_high_weight_elements_priority, the elements are
        certain to be sampled because, for threshold t, an element with weight
        at least 1/t is always sampled.
        """
        method = private_sampling.PrioritySamplingMethod
        sample = private_sampling.ThresholdSample(0.5, method)
        # Process the elements in the same order as before: "a" then "b".
        for key, weight in (("a", 2.0), ("b", 3.0)):
            sample.process(key, weight)
        estimate = sample.estimate_full_statistics()
        self.assertEqual(estimate, 5.0)
    def test_does_not_sample_negligible_weight_ppswor(self):
        """Verifies that PPSWOR sampling skips an element of negligible weight.

        With the threshold fixed at 1.0, an element of weight w is sampled
        with probability 1 - exp(-w). Writing F for
        FAILURE_PROBABILITY_INVERSE, the element processed here has weight
        ln(F / (F - 1)), so the test fails (the element ends up in the sample)
        with probability 1/F.
        """
        ratio = FAILURE_PROBABILITY_INVERSE / (FAILURE_PROBABILITY_INVERSE - 1)
        negligible_weight = math.log(ratio, math.e)
        sample = private_sampling.ThresholdSample(
            1.0, private_sampling.PpsworSamplingMethod)
        sample.process("a", negligible_weight)
        self.assertEmpty(sample.elements)
    def test_does_not_sample_twice_priority(self):
        """Checks that an exception is raised when processing the same key twice.

        The exception is raised when we process a key that is already in the
        sample (this event should not happen since we assume the data is
        aggregated). To trigger it, we start with an element with high weight
        (that is always sampled for priority sampling with this threshold),
        and then try to add it again.

        See test_samples_high_weight_elements_priority for why the first
        element is always sampled.

        Raises:
          ValueError: Propagated (failing the test) if constructing the sample
            or processing the first element raises; only the repeated key is
            expected to raise.
        """
        s = private_sampling.ThresholdSample(
            0.5, private_sampling.PrioritySamplingMethod)
        s.process("a", 2.0)
        # Keep the setup outside the assertRaises context: a ValueError raised
        # by the constructor or by the first insertion must not satisfy the
        # assertion, otherwise the test would pass for the wrong reason.
        with self.assertRaises(ValueError):
            s.process("a", 0.1)
    def test_estimate_full_statistics_ppswor(self):
        """Checks the estimate for the full statistics (using PPSWOR).

        We check the function that estimates the full statistics on a dataset
        that contains one element which is sampled with probability
        1 - 1/FAILURE_PROBABILITY_INVERSE (as in
        test_samples_high_weight_elements_ppswor). We compare the output of
        estimate_full_statistics with the estimate we should get when the
        element is sampled. Therefore, the test should fail with probability
        1/FAILURE_PROBABILITY_INVERSE (when the element is not sampled).
        """
        s = private_sampling.ThresholdSample(
            1.0, private_sampling.PpsworSamplingMethod)
        element_weight = math.log(FAILURE_PROBABILITY_INVERSE, math.e)
        s.process("a", element_weight)
        sampling_probability = (FAILURE_PROBABILITY_INVERSE -
                                1) / FAILURE_PROBABILITY_INVERSE
        # The expected value is derived here with different floating-point
        # operations than the estimator presumably uses, so compare with a
        # tolerance instead of requiring bit-for-bit equality. If the element
        # were not sampled the estimate would differ by far more than the
        # tolerance, so the check keeps its discriminating power.
        self.assertAlmostEqual(s.estimate_full_statistics(),
                               element_weight / sampling_probability)
  def test_high_delta_sample_stays_the_same(self, sampling_class,
                                            sampling_method):
    """Checks that privatizing a sample with delta=1 leaves it unchanged.

    This exercises the functions that build a private sample from an existing
    non-private threshold sample. When delta is 1.0, privacy adds no
    constraints, so the resulting private sample should contain the same
    elements as the non-private sample it was built from.

    Args:
      sampling_class: The private sampling class to be tested
      sampling_method: The underlying sampling method
    """
    non_private = private_sampling.ThresholdSample(0.5, sampling_method)
    for key in range(2000):
      non_private.process(key, 1)
    private_sample = sampling_class.from_non_private(
        non_private, eps=0.1, delta=1.0)
    self.assertCountEqual(non_private.elements.keys(),
                          private_sample.elements)
  def test_samples_close_to_inclusion_probability_priority(self):
    """Checks the empirical sampling rate against the inclusion probability.

    An empty priority sample is fed n elements (n large), each of which is
    sampled with probability 0.5. The test then requires the number of sampled
    elements to lie between 0.49n and 0.51n. The value of n is derived from
    Chernoff bounds so that the test fails with probability at most
    1/FAILURE_PROBABILITY_INVERSE.
    """
    # Half-width of the accepted band around 0.5n.
    tolerance = 0.01
    # Number of elements to process (computed using Chernoff bounds).
    chernoff_factor = 6.0 / (tolerance**2)
    log_term = math.log(2 * FAILURE_PROBABILITY_INVERSE, math.e)
    n = int(chernoff_factor * log_term + 1)

    sample = private_sampling.ThresholdSample(
        0.5, private_sampling.PrioritySamplingMethod)
    for key in range(n):
      sample.process(key, 1.0)

    sampled_count = len(sample.elements)
    self.assertGreaterEqual(sampled_count, (0.5 - tolerance) * n)
    self.assertLessEqual(sampled_count, (0.5 + tolerance) * n)