def test_update_no_collisions(self):
    """
    Compare a sketch against an exact Counter built from equivalent data.

    The sketch width is 2 ** 17 and generateData is asked for a quarter of
    that; the parameters are chosen so that no hash collision affects the
    results. The maximum log error, average log error and bias reported by
    stats() must each lie within the tolerance configured on the test case.
    """
    table_width = 2 ** 17
    requested_keys = table_width // 4

    sketch = CountMinSketch(width=table_width, depth=8, log_counting=self.log_counting)
    sketch.update(generateData(requested_keys))
    reference = Counter()
    reference.update(generateData(requested_keys))

    # stats() reports six values; only three of them are asserted on here.
    (bias, _deviation, max_log_error, avg_log_error,
     _max_d_error, _max_error_expected) = stats(sketch, reference)

    checks = (
        (max_log_error, "Each result should be within maximum tolerance", self.max_log_tolerance),
        (avg_log_error, "Average log deviation should be low", self.avg_log_tolerance),
        (bias, "Total bias should be low", self.total_bias_tolerance),
    )
    for observed, message, tolerance in checks:
        self.assertAlmostEqual(observed, 0, msg=message, delta=tolerance)
def test_depth_alg_init(self):
    """
    Build sketches from a size in MB plus an explicit depth.

    For every case the resulting width must equal the expected value, the
    depth must be stored unchanged, and size() must not exceed the
    requested number of megabytes.
    """
    # Each case: (log_counting, size_mb, expected width, depth)
    cases = (
        (None, 4, 2**18, 4),
        (None, 17, 2**19, 8),
        (1024, 70, 2**21, 17),
        (8, 100, 2**26, 1),
    )
    for algorithm, budget_mb, wanted_width, rows in cases:
        sketch = CountMinSketch(size_mb=budget_mb, depth=rows, log_counting=algorithm)
        self.assertEqual(sketch.width, wanted_width)
        self.assertEqual(sketch.depth, rows)
        self.assertLessEqual(sketch.size(), budget_mb * 1024 * 1024)
def test_width_depth_alg_init(self):
    """
    Build sketches from an explicit width and depth.

    Both values must be stored unchanged, and size() must not exceed the
    upper bound listed for the case.
    """
    # Each case: (log_counting, width, depth, upper bound for size())
    cases = (
        (None, 2**12, 3, 49152),
        (None, 2**13, 7, 229376),
        (1024, 2**18, 1, 524288),
        (8, 2**13, 7, 57344),
    )
    for algorithm, columns, rows, size_limit in cases:
        sketch = CountMinSketch(width=columns, depth=rows, log_counting=algorithm)
        self.assertEqual(sketch.width, columns)
        self.assertEqual(sketch.depth, rows)
        self.assertLessEqual(sketch.size(), size_limit)
class CountMinSketchQualityCommonTest(unittest.TestCase):
    """
    Functional tests for CountMinSketch.quality method, which returns quality rating of the structure
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create the test case.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        """
        self.log_counting = log_counting
        super(CountMinSketchQualityCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (first positional argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def _increment_keys(self, count):
        """Increment the keys '0' .. str(count - 1), each by 1 + (index % 13)."""
        for i in range(count):
            self.cms.increment(str(i), 1 + (i % 13))

    def test_quality_default(self):
        """
        Uses the default structure

        An empty sketch must report quality 0. After incrementing a number
        of distinct keys equal to three quarters of the width, quality must
        lie in [0.5, 1.0]. After a second pass over seven times as many
        keys (which repeats the first batch), it must lie in [4.0, 6.0].
        """
        self.assertEqual(self.cms.quality(), 0)

        # Integer arithmetic avoids the float round-trip of int(x * 3 / 4).
        three_quarters = (self.cms.width * 3) // 4

        self._increment_keys(three_quarters)
        self.assertGreaterEqual(self.cms.quality(), 0.5)
        self.assertLessEqual(self.cms.quality(), 1.0)

        self._increment_keys(three_quarters * 7)
        self.assertGreaterEqual(self.cms.quality(), 4.0)
        self.assertLessEqual(self.cms.quality(), 6.0)
def test_invalid_width(self):
    """
    Check validation of the ``width`` argument.

    The first construction is expected to succeed; each of the remaining
    argument combinations must raise ValueError.
    """
    # Baseline: this combination must be accepted without raising.
    CountMinSketch(size_mb=8, width=2**20)

    rejected = (
        {'size_mb': 8, 'width': 2**20 - 1},  # width must be a power of 2!
        {'size_mb': 8, 'width': 2**22},  # width too large!
        {'width': 2**22 - 1, 'depth': 8},  # width must be a power of 2!
    )
    for options in rejected:
        with self.assertRaises(ValueError):
            CountMinSketch(**options)
def size_check(self, log_counting=None, width_adjustment=1):
    """
    Verify the width and depth chosen for a range of sizes given in MB.

    :param log_counting: forwarded as ``log_counting`` to every CountMinSketch
    :param width_adjustment: factor applied to each tabulated width before
        it is compared with the actual width

    For every case, size() must also be at most the requested number of
    megabytes and strictly greater than half of it.
    """
    # Each case: (size_mb, width before adjustment, depth)
    cases = (
        (1, 2**15, 8),
        (2, 2**16, 8),
        (3, 2**16, 12),
        (4, 2**17, 8),
        (5, 2**17, 10),
        (6, 2**17, 12),
        (7, 2**17, 14),
        (8, 2**18, 8),
        (32, 2**20, 8),
        (55, 2**20, 13),
        (95, 2**21, 11),
        (256, 2**23, 8),
    )
    for budget_mb, base_width, rows in cases:
        sketch = CountMinSketch(budget_mb, log_counting=log_counting)
        self.assertEqual(sketch.width, base_width * width_adjustment, "Width for size %d" % budget_mb)
        self.assertEqual(sketch.depth, rows, "Depth for size %d" % budget_mb)

        budget_bytes = budget_mb * 1024 * 1024
        self.assertLessEqual(sketch.size(), budget_bytes)
        self.assertGreater(sketch.size(), budget_bytes / 2)
def test_update_with_cms(self):
    """
    Update with another CountMinSketch and test the merged counts using set representation

    self.cms is filled from one dictionary, a second sketch from another,
    and the second sketch is then passed to self.cms.update(). Every key
    must report the sum of its counts from both dictionaries.

    The log variants are only precise up to 2048 (16), so we don't use larger values here
    """
    data1 = {'a': 1, 'b': 3, 'c': 2, 'd': 5}
    data2 = {'a': 15, 'b': 4, 'c': 6, 'e': 13}
    expected = {'a': 16, 'b': 7, 'c': 8, 'd': 5, 'e': 13}

    self.cms.update(data1)
    cms2 = CountMinSketch(1, log_counting=self.log_counting)
    cms2.update(data2)
    self.cms.update(cms2)

    # Compare as sets of (key, count) pairs so a failure lists the differing pairs.
    result_set = {(key, self.cms[key]) for key in expected}
    self.assertEqual(result_set, set(expected.items()))
def setUp(self):
    """Create a fresh sketch for each test, using the log_counting value stored on the test case."""
    counting_mode = self.log_counting
    self.cms = CountMinSketch(1, log_counting=counting_mode)
class CountMinSketchUpdateCommonTest(unittest.TestCase):
    """
    Functional tests for CountMinSketch.update method, which adds another counter, dictionary, hashtable, tuple or list
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create the test case.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to every CountMinSketch built by this case
        """
        self.log_counting = log_counting
        super(CountMinSketchUpdateCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (first positional argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def test_update_numbers(self):
        """
        Negative test: calling update using numeric values as parameter yields TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.update(1)

        with self.assertRaises(TypeError):
            self.cms.update(1.0)

    def test_update_string(self):
        """A string is iterated character by character."""
        self.cms.update("foo")
        self.assertEqual(self.cms['f'], 1)
        self.assertEqual(self.cms['o'], 2)

    def test_update_tuple(self):
        """Each element of a tuple is counted once per occurrence."""
        items = ('foo', 'bar', 'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms['bar'], 1)

    def test_update_bytes(self):
        """Keys added as bytes are readable through both the str and the bytes form."""
        items = (b'foo', b'bar', b'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms[b'foo'], 2)
        self.assertEqual(self.cms['bar'], 1)

    def test_update_unicode(self):
        """Plain and u-prefixed literals with the same text share one counter."""
        items = ('foo', 'bar', u'foo')
        self.cms.update(items)
        self.assertEqual(self.cms['foo'], 2)
        self.assertEqual(self.cms[u'foo'], 2)

    def test_update_list(self):
        """Each element of a list is counted once per occurrence."""
        # Produces ['0', '1', '2', '0', '1'].
        self.cms.update([str(i % 3) for i in range(5)])
        self.assertEqual(self.cms['0'], 2)
        self.assertEqual(self.cms['1'], 2)
        self.assertEqual(self.cms['2'], 1)

    def test_update_split(self):
        """Words from a split sentence are counted; keys are case sensitive."""
        self.cms.update("This is a sentence".split())
        self.assertEqual(self.cms['is'], 1)
        self.assertEqual(self.cms['this'], 0)  # lowercase

    def test_update_twice(self):
        """Two consecutive updates accumulate their counts."""
        items = ('foo', 'bar', 'foo')
        self.cms.update(items)
        self.cms.update(('foo', 'bar', 'foo'))
        self.assertEqual(self.cms['foo'], 4)
        self.assertEqual(self.cms['bar'], 2)

    def test_update_with_dictionary(self):
        """
        Update with a dictionary and test against it using set representation
        """
        data = {'a': 1, 'b': 3, 'c': 2, 'd': 5}

        self.cms.update(data)
        self.assertEqual(self.cms['b'], 3)

        result_set = {(key, self.cms[key]) for key in data}
        self.assertEqual(result_set, set(data.items()))

    def test_update_with_cms(self):
        """
        Update with another CountMinSketch and test the merged counts using set representation

        The log variants are only precise up to 2048 (16), so we don't use larger values here
        """
        data1 = {'a': 1, 'b': 3, 'c': 2, 'd': 5}
        data2 = {'a': 15, 'b': 4, 'c': 6, 'e': 13}
        expected = {'a': 16, 'b': 7, 'c': 8, 'd': 5, 'e': 13}

        self.cms.update(data1)
        cms2 = CountMinSketch(1, log_counting=self.log_counting)
        cms2.update(data2)
        self.cms.update(cms2)

        result_set = {(key, self.cms[key]) for key in expected}
        self.assertEqual(result_set, set(expected.items()))
class CountMinSketchSanityCommonTest(unittest.TestCase):
    """
    Functional tests for setting and retrieving values of the counter
    """

    def __init__(self, methodName='runTest', log_counting=None, delta=0.0):
        """
        Create the test case.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        :param delta: tolerance factor; it is multiplied by the expected value
            in the assertAlmostEqual checks of this class
        """
        self.log_counting = log_counting
        self.delta = delta
        super(CountMinSketchSanityCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (first positional argument 1) for every test."""
        self.cms = CountMinSketch(1, log_counting=self.log_counting)

    def test_unknown_is_zero(self):
        """A key that was never incremented reads as 0."""
        self.assertEqual(self.cms['foo'], 0)

    def test_increment_default(self):
        """increment() without a value adds one per call."""
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms['foo'], 3)
        self.assertEqual(self.cms['bar'], 1)

    def test_increment_bytes(self):
        """A bytes key and the str key with the same text share one counter."""
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment(b'foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms['foo'], 3)
        self.assertEqual(self.cms[b'foo'], 3)

    def test_total(self):
        """total() returns the sum of all increments."""
        self.assertEqual(self.cms.total(), 0)
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms.total(), 4)
        self.cms.increment('goo', 3)
        self.assertEqual(self.cms.total(), 7)

    def test_cardinality(self):
        """cardinality() returns the number of distinct keys incremented."""
        self.assertEqual(self.cms.cardinality(), 0)
        self.cms.increment('foo')
        self.cms.increment('bar')
        self.cms.increment('foo')
        self.cms.increment('foo')
        self.assertEqual(self.cms.cardinality(), 2)
        self.cms.increment('goo', 3)
        self.assertEqual(self.cms.cardinality(), 3)

    def test_increment_by_value(self):
        """An explicit increment value is stored, within delta * value."""
        foo_value = 42
        bar_value = 53
        self.cms.increment('foo', foo_value)
        self.cms.increment('bar', bar_value)
        self.assertAlmostEqual(self.cms['foo'], foo_value, delta=self.delta * foo_value)
        self.assertAlmostEqual(self.cms['bar'], bar_value, delta=self.delta * bar_value)

    def test_repeat_increment(self):
        """
        Test that repeated increments of the same key accumulate
        """
        self.cms.increment('foo', 5)
        self.cms.increment('foo', 10)
        self.assertEqual(self.cms['foo'], 15)

    def test_increment_int_key(self):
        """
        Negative test: integer keys are not supported and yield TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.increment(1)

    def test_get_increment_object_key(self):
        """
        Negative test: object keys are not supported and yield TypeError
        """
        # Defined locally, as in test_set_object_value, so this test does not
        # depend on a MyClass name existing outside this class.
        class MyClass(object):
            pass

        o = MyClass()
        with self.assertRaises(TypeError):
            self.cms.increment(o)

    def test_get_increment_empty_string(self):
        """The empty string is a valid key with its own counter."""
        self.cms.increment('foo', 42)
        self.cms.increment('bar', 53)
        self.assertEqual(self.cms[''], 0)
        self.cms.increment('', 3)
        self.assertEqual(self.cms[''], 3)
        self.cms.increment('')
        self.assertEqual(self.cms[''], 4)

    def test_get_increment_long_string(self):
        """Two long keys that differ only in length are counted separately."""
        long_string = 'l' + ('o' * 100) + 'ng'
        longer_string = 'l' + ('o' * 120) + 'ng'
        self.cms.increment(long_string, 2)
        self.cms.increment(longer_string, 3)
        self.assertEqual(self.cms[long_string], 2)
        self.assertEqual(self.cms[longer_string], 3)

    def test_get_increment_non_ascii_string(self):
        """Non-ASCII keys that differ in a single symbol are counted separately."""
        non_ascii_string = "Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☹ "
        # the second line contains a different symbol
        similar_string = "Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☺ "
        self.cms.increment(non_ascii_string, 2)
        self.cms.increment(similar_string, 3)
        self.assertEqual(self.cms[non_ascii_string], 2)
        self.assertEqual(self.cms[similar_string], 3)

    def test_get_increment_non_ascii_unicode(self):
        """Same as the non-ASCII string test, using u-prefixed literals."""
        non_ascii_unicode = u"Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☹ "
        # the second line contains a different symbol
        similar_unicode = u"Non-ascii dôverivá Čučoriedka 9#8\\%7 平仮名\n☃\t+☀\t=\t☺ "
        self.cms.increment(non_ascii_unicode, 2)
        self.cms.increment(similar_unicode, 3)
        self.assertEqual(self.cms[non_ascii_unicode], 2)
        self.assertEqual(self.cms[similar_unicode], 3)

    def test_increment_string_value(self):
        """
        Negative test: string values are not supported and yield TypeError
        """
        with self.assertRaises(TypeError):
            self.cms.increment('foo', 'bar')

    def test_set_object_value(self):
        """
        Negative test: object values are not supported and yield TypeError
        """
        class MyClass(object):
            pass

        with self.assertRaises(TypeError):
            self.cms.increment('foo', MyClass())

    def test_increment_big_number(self):
        """A large increment value is stored, within delta * value."""
        big_number = 127451
        self.cms.increment('big number', big_number)
        self.assertAlmostEqual(self.cms['big number'], big_number, delta=self.delta * big_number)

    def test_increment_negative(self):
        """
        Negative test, raises ValueError on negative values
        """
        # new value
        with self.assertRaises(ValueError):
            self.cms.increment('foo', -4)
        self.assertEqual(self.cms['foo'], 0, "value should remain unaffected")

        self.cms.increment('foo', 3)

        # existing value
        with self.assertRaises(ValueError):
            self.cms.increment('foo', -2)
        self.assertEqual(self.cms['foo'], 3, "value should remain unaffected")

    def test_increment_zero(self):
        """
        Incrementing by zero leaves the stored value unchanged
        """
        self.cms.increment('foo', 0)
        self.assertEqual(self.cms['foo'], 0)

        self.cms.increment('foo')
        self.cms.increment('foo', 0)
        self.assertEqual(self.cms['foo'], 1)
def test_largest_cms(self):
    """
    Construct a sketch with size_mb=16384 and log_counting=8.

    There are no assertions: the test passes when construction does not raise.
    """
    options = {'size_mb': 16384, 'log_counting': 8}
    largest = CountMinSketch(**options)
def test_invalid_sizemb(self):
    """Negative test: a size of 0.5 is rejected with ValueError."""
    self.assertRaises(ValueError, CountMinSketch, 0.5)
def test_invalid_algorithm(self):
    """Negative test: each listed log_counting value is rejected with ValueError."""
    for rejected in ('basic', 'log8', 'cons', 'logcounter', 5):
        self.assertRaises(ValueError, CountMinSketch, 1, log_counting=rejected)
class CountMinSketchPickleCommonTest(unittest.TestCase):
    """
    Functional tests for pickling a CountMinSketch: the structure is dumped to a file,
    loaded back, and its cardinality, total and individual counts are checked.
    """

    def __init__(self, methodName='runTest', log_counting=None):
        """
        Create the test case.

        :param methodName: name of the test method to run, forwarded to unittest.TestCase
        :param log_counting: value passed as ``log_counting`` to the CountMinSketch built in setUp
        """
        self.log_counting = log_counting
        super(CountMinSketchPickleCommonTest, self).__init__(methodName=methodName)

    def setUp(self):
        """Create a fresh sketch (first positional argument 2) for every test."""
        self.cms = CountMinSketch(2, log_counting=self.log_counting)

    def tearDown(self):
        """Remove the pickle file if a test created it."""
        # `filename` is a module-level name defined outside this class.
        if os.path.isfile(filename):
            os.remove(filename)

    def store_and_load(self):
        """
        Pickle self.cms to `filename` and return the object loaded back from it.
        """
        with open(filename, 'wb') as outfile:
            pickle.dump(self.cms, outfile)

        # The file read here is the one written just above by this test.
        with open(filename, 'rb') as infile:
            reloaded = pickle.load(infile)

        return reloaded

    def check_cms(self, cms, data):
        """
        Assert that `cms` agrees with the mapping `data`.

        :param cms: sketch under test
        :param data: mapping of key to expected count

        Checks cardinality against len(data), total against the sum of the
        values, and the count of every key in `data`.
        """
        self.assertAlmostEqual(cms.cardinality(), len(data))
        self.assertEqual(cms.total(), sum(data.values()))

        result_set = {(key, cms[key]) for key in data}
        self.assertEqual(result_set, set(data.items()))

    def test_pickle_empty(self):
        """An empty sketch is still empty after a pickle round trip."""
        reloaded = self.store_and_load()
        self.check_cms(reloaded, {})

    def test_pickle_simple(self):
        """Counts are identical before and after a pickle round trip."""
        expected = Counter()
        # Feed the sketch and the exact Counter with the same characters.
        for structure in [self.cms, expected]:
            structure.update("pickling")
            structure.update("lorem ipsum dolor amet")
            structure.update("122333444455555666666")

        self.check_cms(self.cms, expected)
        reloaded = self.store_and_load()
        self.check_cms(reloaded, expected)

    def test_pickle_increment_after_reload(self):
        """A reloaded sketch accepts further updates and increments."""
        expected = Counter()
        for structure in [self.cms, expected]:
            structure.update("pickling")
        self.cms.increment('1')
        self.cms.increment('2', 2)
        expected['1'] += 1
        expected['2'] += 2

        self.check_cms(self.cms, expected)
        reloaded = self.store_and_load()

        for structure in [reloaded, expected]:
            structure.update("pickling")
        reloaded.increment('1', 1)
        reloaded.increment('3', 3)
        expected['1'] += 1
        expected['3'] += 3

        self.check_cms(reloaded, expected)