def testMaintainsOrder(self):
    """Items returned by a bucket must be strictly increasing.

    10000 increasing integers are pushed through a bucket constructed with
    size 100; whatever subset is retained has to come back in insertion
    order.
    """
    b = reservoir._ReservoirBucket(100)
    for i in xrange(10000):
        b.AddItem(i)

    # -1 is below every inserted value, so the first comparison always holds.
    prev = -1
    for item in b.Items():
        # assertGreater reports both operands on failure, unlike
        # assertTrue(item > prev) which only says "False is not true".
        self.assertGreater(item, prev)
        prev = item
def testRemovesItems(self):
    """FilterItems drops rejected items and reports how many it removed.

    Ten items are added to a bucket that has not been filled, the filter
    keeps values <= 7, and the test expects two removals with both the
    stored item count and `_num_items_seen` dropping to 8.
    """
    bucket = reservoir._ReservoirBucket(100)
    for value in xrange(10):
        bucket.AddItem(value)
    self.assertEqual(len(bucket.Items()), 10)
    self.assertEqual(bucket._num_items_seen, 10)

    # Only 8 and 9 fail the predicate.
    removed_count = bucket.FilterItems(lambda item: item <= 7)
    self.assertEqual(removed_count, 2)

    self.assertEqual(len(bucket.Items()), 8)
    self.assertEqual(bucket._num_items_seen, 8)
def testRemovesItemsWhenItemsAreReplaced(self):
    """FilterItems on a bucket that has seen far more items than it holds.

    After 10000 additions to a size-100 bucket, filtering on `x <= 7` is
    expected to remove more than 92 items, leave nothing above 7, and scale
    `_num_items_seen` by the fraction of stored items that survived.
    """
    bucket = reservoir._ReservoirBucket(100)
    for value in xrange(10000):
        bucket.AddItem(value)
    self.assertEqual(bucket._num_items_seen, 10000)

    # Remove items
    num_removed = bucket.FilterItems(lambda item: item <= 7)
    self.assertGreater(num_removed, 92)

    leftovers_above_seven = [item for item in bucket.Items() if item > 7]
    self.assertEqual([], leftovers_above_seven)

    # Expected count: the original 10000 scaled by the surviving share of
    # the 100 stored slots.
    surviving_fraction = 1 - float(num_removed) / 100
    expected_seen = int(round(10000 * surviving_fraction))
    self.assertEqual(bucket._num_items_seen, expected_seen)
def testLazyFunctionEvaluationAndAlwaysKeepLast(self):
    """The transform passed to AddItem is only invoked for kept items.

    With `always_keep_last=False` the transform should run 100 times (once
    per slot while the bucket fills); with `always_keep_last=True` it
    should run for every one of the 1000 added items.
    """
    # NOTE(review): a second method with this exact name appears elsewhere
    # in this file; if both are in the same class the later definition
    # shadows the earlier one -- confirm which should be kept.

    class FakeRandom(object):
        """Stand-in random source whose randint always yields 999."""

        def randint(self, a, b):  # pylint:disable=unused-argument
            return 999

    class Incrementer(object):
        """Counts how many times its transform has been called."""

        def __init__(self):
            self.n = 0

        def increment_and_double(self, x):
            self.n += 1
            return x * 2

    def fill_bucket(keep_last):
        # Build a size-100 bucket with the fake randomness and push 1000
        # items through it, transforming each with a fresh counter.
        bucket = reservoir._ReservoirBucket(
            100, FakeRandom(), always_keep_last=keep_last)
        counter = Incrementer()
        for value in range(1000):
            bucket.AddItem(value, counter.increment_and_double)
        return bucket, counter

    # We've mocked the randomness generator, so that once it is full, the
    # last item will never get durable reservoir inclusion. Since
    # always_keep_last is false, the function should only get invoked 100
    # times while filling up the reservoir. This laziness property is an
    # essential performance optimization.
    bucket, counter = fill_bucket(False)
    self.assertEqual(counter.n, 100)
    self.assertEqual(bucket.Items(), list(range(0, 200, 2)))

    # This time, we will always keep the last item, meaning that the
    # function should get invoked once for every item we add.
    bucket, counter = fill_bucket(True)
    self.assertEqual(counter.n, 1000)
    self.assertEqual(bucket.Items(), list(range(0, 198, 2)) + [999 * 2])
def testLazyFunctionEvaluationAndAlwaysKeepLast(self):
    """AddItem's transform must be evaluated lazily unless the last item is kept.

    Two size-100 buckets are driven with a fake random source: one with
    `always_keep_last=False` (transform expected to run 100 times) and one
    with `always_keep_last=True` (transform expected to run 1000 times).
    """

    class ConstantRandom(object):
        """Fake random source; randint ignores its bounds and returns 999."""

        def randint(self, a, b):  # pylint:disable=unused-argument
            return 999

    class CallCounter(object):
        """Doubles its input while counting invocations."""

        def __init__(self):
            self.n = 0

        def increment_and_double(self, x):
            self.n += 1
            return x * 2

    # We've mocked the randomness generator, so that once it is full, the
    # last item will never get durable reservoir inclusion. Since
    # always_keep_last is false, the function should only get invoked 100
    # times while filling up the reservoir. This laziness property is an
    # essential performance optimization.
    lazy_bucket = reservoir._ReservoirBucket(
        100, ConstantRandom(), always_keep_last=False)
    lazy_counter = CallCounter()
    for value in xrange(1000):
        lazy_bucket.AddItem(value, lazy_counter.increment_and_double)
    self.assertEqual(lazy_counter.n, 100)
    expected_lazy = [2 * n for n in xrange(100)]
    self.assertEqual(lazy_bucket.Items(), expected_lazy)

    # This time, we will always keep the last item, meaning that the
    # function should get invoked once for every item we add.
    eager_bucket = reservoir._ReservoirBucket(
        100, ConstantRandom(), always_keep_last=True)
    eager_counter = CallCounter()
    for value in xrange(1000):
        eager_bucket.AddItem(value, eager_counter.increment_and_double)
    self.assertEqual(eager_counter.n, 1000)
    expected_eager = [2 * n for n in xrange(99)] + [999 * 2]
    self.assertEqual(eager_bucket.Items(), expected_eager)
def testBucketReservoirSamplingViaStatisticalProperties(self):
    """Sampled items should be spread evenly across histogram bins.

    The retained items are histogrammed two ways (by integer division and
    by modulo) and every bin count is checked with AssertBinomialQuantity.
    Here `self.n_buckets` is the number of histogram bins used for the
    distribution check, not a count of `_ReservoirBucket` objects.
    """
    bucket = reservoir._ReservoirBucket(_max_size=self.samples)
    # add one extra item because we always keep the most recent item, which
    # would skew the distribution; we can just slice it off the end instead.
    for value in xrange(self.total + 1):
        bucket.AddItem(value)

    div_counts = [0] * self.n_buckets
    mod_counts = [0] * self.n_buckets
    # Slice off the last item when we iterate.
    for sampled in bucket.Items()[:-1]:
        div_counts[sampled // self.total_per_bucket] += 1
        mod_counts[sampled % self.n_buckets] += 1

    # Both histograms have n_buckets entries, so walking them in lockstep
    # visits the same indices in the same order as an index loop.
    for div_count, mod_count in zip(div_counts, mod_counts):
        self.AssertBinomialQuantity(div_count)
        self.AssertBinomialQuantity(mod_count)
def testEmptyBucket(self):
    """A bucket with nothing added reports a falsy (empty) Items()."""
    untouched_bucket = reservoir._ReservoirBucket(1)
    contents = untouched_bucket.Items()
    self.assertFalse(contents)
def testDoesntOverfill(self):
    """A size-10 bucket holds 10 items yet counts all 1000 additions."""
    bucket = reservoir._ReservoirBucket(10)
    for value in xrange(1000):
        bucket.AddItem(value)

    stored = bucket.Items()
    self.assertEqual(len(stored), 10)
    self.assertEqual(bucket._num_items_seen, 1000)
def testSizeRequirement(self):
    """Constructing a bucket with a negative or non-integer size raises ValueError."""
    # Checked in the same order as before: negative first, then fractional.
    for invalid_size in (-1, 10.3):
        with self.assertRaises(ValueError):
            reservoir._ReservoirBucket(invalid_size)
def testSizeZeroBucket(self):
    """A bucket constructed with size 0 is expected to retain every item.

    After each addition the bucket's contents must equal all values added
    so far, and the seen-count must reach 20 at the end.
    """
    bucket = reservoir._ReservoirBucket(0)
    for value in xrange(20):
        bucket.AddItem(value)
        everything_so_far = list(range(value + 1))
        self.assertEqual(bucket.Items(), everything_so_far)
    self.assertEqual(bucket._num_items_seen, 20)
def testSizeOneBucket(self):
    """A size-1 bucket holds exactly the most recently added item.

    The contents are checked after every addition; the seen-count must
    still reflect all 20 additions.
    """
    bucket = reservoir._ReservoirBucket(1)
    for value in xrange(20):
        bucket.AddItem(value)
        only_latest = [value]
        self.assertEqual(bucket.Items(), only_latest)
    self.assertEqual(bucket._num_items_seen, 20)
def testKeepsLatestItem(self):
    """The final element of Items() is always the item just added."""
    bucket = reservoir._ReservoirBucket(5)
    for value in xrange(100):
        bucket.AddItem(value)
        # Checked after every addition, both while the size-5 bucket is
        # filling and once it is full.
        newest_stored = bucket.Items()[-1]
        self.assertEqual(newest_stored, value)
def testFillToSize(self):
    """Adding exactly as many items as the bucket size keeps all of them in order."""
    bucket = reservoir._ReservoirBucket(100)
    for value in xrange(100):
        bucket.AddItem(value)

    expected_contents = list(xrange(100))
    self.assertEqual(bucket.Items(), expected_contents)
    self.assertEqual(bucket._num_items_seen, 100)