def test_sketch_roundtrip_serialize():
    """Serialize a populated sketch and check deserializing restores its items."""
    original = make_sketch_and_track(ALL_VALS)
    payload = original.serialize()

    # The size the sketch reports must match the payload it actually produced.
    assert len(payload) == original.get_serialized_size_bytes()

    restored = dsketch.FrequentItemsSketch.deserialize(payload)
    compare_frequent_items(
        restored.get_frequent_items(),
        original.get_frequent_items(),
    )
def test_frequent_items_correct(number_sketch):
    """Check the frequent items reported by the ``number_sketch`` fixture.

    NOTE(review): another function named ``test_frequent_items_correct``
    (taking no arguments) is visible further down; if both live in the same
    module the later definition shadows this one -- confirm.
    """
    # Each row is (item, <three numeric fields>); the rows are handed to
    # compare_frequent_items unchanged.
    expected = [
        (1, 3, 3, 3),
        (4, 2, 2, 2),
        (2, 1, 1, 1),
        (3, 1, 1, 1),
        (5.0, 1, 1, 1),
    ]
    compare_frequent_items(expected, number_sketch.get_frequent_items())
def test_merge_gives_correct_values():
    """Merge a sketch with a copy of itself and expect every count doubled."""
    base = make_sketch_and_track(ALL_VALS)
    combined = base.merge(base.copy())

    base_items = base.get_frequent_items()
    combined_items = combined.get_frequent_items()

    # Merging identical sketches should leave each item in place and double
    # the three numeric fields that follow it.
    doubled = [
        (row[0], 2 * row[1], 2 * row[2], 2 * row[3])
        for row in base_items
    ]
    compare_frequent_items(doubled, combined_items)
def test_merge_empty_gives_same_result():
    """Merging with an empty sketch, in either direction, keeps the items."""
    populated = make_sketch_and_track(ALL_VALS)
    blank = dsketch.FrequentItemsSketch()
    baseline = populated.get_frequent_items()

    # Empty merged into the populated sketch.
    result = populated.merge(blank)
    compare_frequent_items(baseline, result.get_frequent_items())

    # Populated sketch merged into the empty one.
    result = blank.merge(populated)
    compare_frequent_items(baseline, result.get_frequent_items())
def test_protobuf():
    """Round-trip a ``ColumnProfile`` through its protobuf message."""
    profile = ColumnProfile('col')
    for value in (1, 2, 3):
        profile.track(value)

    msg = profile.to_protobuf()
    restored = ColumnProfile.from_protobuf(msg)

    assert restored.column_name == profile.column_name == 'col'
    assert hasattr(restored, 'number_tracker')

    msg2 = restored.to_protobuf()

    # The serialized frequent-items payloads are not compared directly;
    # the decoded frequent items are compared instead.
    compare_frequent_items(
        restored.number_tracker.frequent_numbers.get_frequent_items(),
        profile.number_tracker.frequent_numbers.get_frequent_items(),
    )

    # Blank the sketch bytes on both messages.
    # NOTE(review): nothing compares ``msg`` and ``msg2`` after this point in
    # the visible code -- confirm whether an equality assertion was intended.
    for message in (msg, msg2):
        message.numbers.frequent_numbers.sketch = bytes()
def test_frequent_items_correct():
    """Track ``ALL_VALS`` and check the reported frequent items.

    NOTE(review): another function named ``test_frequent_items_correct``
    (taking a ``number_sketch`` fixture) is visible earlier; if both live in
    the same module this definition shadows it -- confirm.
    """
    # Expected rows grouped by the Python type of the item; the overall order
    # is the same as a single flat list.
    int_rows = [
        (1, 3, 3, 3),
        (2, 1, 1, 1),
        (3, 1, 1, 1),
        (4, 2, 2, 2),
    ]
    float_rows = [
        (5.0, 2, 2, 2),
        (4.0, 1, 1, 1),
        (1.0e90, 1, 1, 1),
    ]
    bool_rows = [
        (True, 2, 2, 2),
        (False, 1, 1, 1),
    ]
    str_rows = [
        ("a", 3, 3, 3),
        ("b", 2, 2, 2),
        ("hello world", 1, 1, 1),
        ("hello World", 1, 1, 1),
    ]
    expected = int_rows + float_rows + bool_rows + str_rows

    tracked = make_sketch_and_track(ALL_VALS)
    compare_frequent_items(expected, tracked.get_frequent_items())
def test_merge():
    """Merge a ``NumberTracker`` with itself and check the combined stats."""
    tracker = NumberTracker()
    for value in (10, 11, 13):
        tracker.track(value)

    combined = tracker.merge(tracker)

    # Three ints tracked, merged with themselves: six ints and no floats.
    assert combined.ints.count == 6
    assert combined.floats.count == 0

    histogram = combined.histogram
    assert histogram.get_n() == 6
    assert histogram.get_max_value() == 13.0
    assert histogram.get_min_value() == 10.0

    expected_rows = [
        (10, 2, 2, 2),
        (11, 2, 2, 2),
        (13, 2, 2, 2),
    ]
    compare_frequent_items(
        expected_rows,
        combined.frequent_numbers.get_frequent_items(),
    )

    # The merged tracker must be convertible to protobuf and back without
    # raising; the result is not inspected.
    NumberTracker.from_protobuf(combined.to_protobuf())
def test_copy_gives_same_results():
    """A copied sketch reports the same frequent items as its source."""
    source = make_sketch_and_track(ALL_VALS)
    duplicate = source.copy()
    compare_frequent_items(
        source.get_frequent_items(),
        duplicate.get_frequent_items(),
    )
def test_protobuf_roundtrip():
    """A sketch rebuilt from its protobuf message keeps its frequent items."""
    original = make_sketch_and_track(ALL_VALS)
    rebuilt = dsketch.FrequentItemsSketch.from_protobuf(original.to_protobuf())
    compare_frequent_items(
        original.get_frequent_items(),
        rebuilt.get_frequent_items(),
    )