def test_hll_sketch(self):
    """Exercise the basic hll_sketch API on a small HLL_6 sketch.

    Covers updates with string and float data, estimate bounds,
    configuration attributes, both serialization forms, the static
    helpers, and reset().
    """
    lg_k = 8
    num_values = 117
    sketch = self.generate_sketch(num_values, lg_k, tgt_hll_type.HLL_6)

    # Feed two extra items of different Python types.
    sketch.update('string data')
    sketch.update(3.14159)  # double data

    # The 1-std-dev bounds must bracket the estimate.
    self.assertLessEqual(sketch.get_lower_bound(1), sketch.get_estimate())
    self.assertGreaterEqual(sketch.get_upper_bound(1), sketch.get_estimate())

    # The sketch reports the configuration it was created with.
    self.assertEqual(sketch.lg_config_k, lg_k)
    self.assertEqual(sketch.tgt_type, tgt_hll_type.HLL_6)

    # Serialized sizes must match what the sketch predicts.
    compact_blob = sketch.serialize_compact()
    updatable_blob = sketch.serialize_updatable()
    self.assertEqual(len(compact_blob),
                     sketch.get_compact_serialization_bytes())
    self.assertEqual(len(updatable_blob),
                     sketch.get_updatable_serialization_bytes())

    self.assertFalse(sketch.is_compact())
    self.assertFalse(sketch.is_empty())

    # Both serialized forms must deserialize back into an hll_sketch.
    for blob in (compact_blob, updatable_blob):
        self.assertIsInstance(hll_sketch.deserialize(blob), hll_sketch)

    # Static helpers only need to return something.
    self.assertIsNotNone(hll_sketch.get_rel_err(True, False, 12, 1))
    self.assertIsNotNone(
        hll_sketch.get_max_updatable_serialization_bytes(
            20, tgt_hll_type.HLL_6))

    # After reset() the sketch reports itself as empty.
    sketch.reset()
    self.assertTrue(sketch.is_empty())
def test_hll_example(self):
    """Worked example: build two sketches, union them, round-trip the result."""
    lg_k = 12  # 2^lg_k = 4096 rows in the table
    num_unique = 1 << 18  # ~256k unique values

    # Create two sketches of different target types and feed them
    # value ranges that overlap in 1/4 of the values.
    sketch_a = hll_sketch(lg_k, tgt_hll_type.HLL_8)
    sketch_b = hll_sketch(lg_k, tgt_hll_type.HLL_6)

    shift = int(3 * num_unique / 4)  # true division yields a float

    # Hashing works on the bits rather than on an abstract numeric
    # value, so update(1) and update(1.0) give different results.
    for value in range(num_unique):
        sketch_a.update(value)
        sketch_b.update(value + shift)

    # Both get_composite_estimate() and get_estimate() exist, but the
    # latter always gives the best available estimate and is the
    # recommended one.

    # The bounds must bracket the estimate; this can be checked
    # without knowing the exact value.
    self.assertLessEqual(sketch_a.get_lower_bound(1),
                         sketch_a.get_estimate())
    self.assertGreaterEqual(sketch_a.get_upper_bound(1),
                            sketch_a.get_estimate())

    # Unions are a separate class; the union can be queried directly
    # or asked for a result sketch.
    union = hll_union(lg_k)
    for sketch in (sketch_a, sketch_b):
        union.update(sketch)
    merged = union.get_result()
    self.assertEqual(merged.get_estimate(), union.get_estimate())

    # The whole process (including the post-union HLL) is
    # deterministic, and the exact answer has been checked to lie
    # within one standard deviation of the estimate.
    exact_distinct = 7 * num_unique / 4
    self.assertLessEqual(union.get_lower_bound(1), exact_distinct)
    self.assertGreaterEqual(union.get_upper_bound(1), exact_distinct)

    # Serialize for storage and reconstruct.
    serialized = merged.serialize_compact()
    self.assertEqual(len(serialized),
                     merged.get_compact_serialization_bytes())
    restored = hll_sketch.deserialize(serialized)

    # The sketch can self-report its configuration and status.
    self.assertEqual(restored.lg_config_k, lg_k)
    self.assertEqual(restored.tgt_type, tgt_hll_type.HLL_4)
    self.assertFalse(restored.is_empty())

    # To reduce some object overhead, the sketch can also be reset.
    restored.reset()
    self.assertTrue(restored.is_empty())
def __merge_states(self, states: Sequence[State]) -> State:
    """Merge partial states of one analyzer into a single state.

    The concrete type of ``states[0]`` selects the merge strategy and
    all other entries are treated as being of that same type. The
    merged state carries the id of the first state.

    Args:
        states: Non-empty sequence of states of the same kind.

    Returns:
        The merged state. For a ``SchemaState`` the first state is
        returned unchanged. ``None`` is returned when the type of the
        first state is not handled here.

    Raises:
        IndexError: If ``states`` is empty.
        NotImplementedError: For ``FrequenciesAndNumRows`` states.
    """
    first_state = states[0]
    # Several strategies seed from the first state and fold in the rest.
    remaining_states = states[1:]
    result_state = None

    if isinstance(first_state, SchemaState):
        result_state = first_state

    elif isinstance(first_state, MaxState):
        max_value: float = max(state.max_value for state in states)
        result_state = MaxState(first_state.id, max_value)

    elif isinstance(first_state, MeanState):
        total = sum(state.total for state in states)
        count = sum(state.count for state in states)
        result_state = MeanState(first_state.id, total, count)

    elif isinstance(first_state, MinState):
        min_value: float = min(state.min_value for state in states)
        result_state = MinState(first_state.id, min_value)

    elif isinstance(first_state, NumMatches):
        num_matches = sum(state.num_matches for state in states)
        result_state = NumMatches(first_state.id, num_matches)

    elif isinstance(first_state, NumMatchesAndCount):
        num_matches = sum(state.num_matches for state in states)
        count = sum(state.count for state in states)
        result_state = NumMatchesAndCount(first_state.id, num_matches,
                                          count)

    elif isinstance(first_state, QuantileState):
        # The sketch instance is only used to reach deserialize() for
        # the matching value type.
        if first_state.sketch_type == "floats":
            kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
        else:
            kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
        main_kll = kll_ser.deserialize(
            bytes.fromhex(first_state.serializedKll))
        for state in remaining_states:
            main_kll.merge(
                kll_ser.deserialize(bytes.fromhex(state.serializedKll)))
        result_state = QuantileState(first_state.id,
                                     main_kll.serialize().hex(),
                                     first_state.quantile,
                                     first_state.sketch_type)

    elif isinstance(first_state, ApproxDistinctState):
        main_hll = hll_sketch.deserialize(
            bytes.fromhex(first_state.serializedHll))
        num_rows = first_state.num_rows
        if remaining_states:
            # hll_sketch.update() ingests data items, not sketches;
            # combining sketches has to go through hll_union.
            union = hll_union(main_hll.lg_config_k)
            union.update(main_hll)
            for state in remaining_states:
                num_rows = num_rows + state.num_rows
                union.update(hll_sketch.deserialize(
                    bytes.fromhex(state.serializedHll)))
            # Keep the target type of the first sketch for the result.
            main_hll = union.get_result(main_hll.tgt_type)
        approx_distinct_count = main_hll.get_estimate()
        serialized_hll = main_hll.serialize_updatable().hex()
        result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                           approx_distinct_count, num_rows)

    elif isinstance(first_state, StandardDeviationState):
        n: float = first_state.n
        avg: float = first_state.avg
        m2: float = first_state.m2
        stddev: float = first_state.stddev
        for state in remaining_states:
            # Pairwise combination of (count, mean, M2). The old count
            # and old mean must be used for the weights and for delta,
            # so n and avg are only overwritten at the end.
            combined_n = n + state.n
            delta = state.avg - avg
            # Guard against merging only empty partitions.
            delta_n = delta / combined_n if combined_n != 0 else 0.0
            m2 = m2 + state.m2 + delta * delta_n * n * state.n
            avg = avg + delta_n * state.n
            n = combined_n
            # NOTE(review): this is m2 / (n - 1) without a square root,
            # kept exactly as before -- confirm it matches how the
            # stddev field is computed elsewhere in the project.
            stddev = (m2 / (n - 1)) if n > 1 else 0
        result_state = StandardDeviationState(first_state.id, n, avg, m2,
                                              stddev)

    elif isinstance(first_state, SumState):
        sum_value = sum(state.sum_value for state in states)
        result_state = SumState(first_state.id, sum_value)

    elif isinstance(first_state, FrequenciesAndNumRows):
        raise NotImplementedError(
            "Merging of FrequenciesAndNumRows states not implemented, yet")

    return result_state