예제 #1
0
    def test_hll_sketch(self):
        """Exercise the basic hll_sketch API surface.

        Covers datum updates, error bounds, configuration properties,
        both serialization forms, the static helpers, and reset().
        """
        lg_k = 8
        num_items = 117
        target_type = tgt_hll_type.HLL_6
        sketch = self.generate_sketch(num_items, lg_k, target_type)

        # Feed one string and one double on top of whatever
        # generate_sketch() already put into the sketch.
        for datum in ('string data', 3.14159):
            sketch.update(datum)

        # The one-sigma bounds must bracket the estimate.
        estimate = sketch.get_estimate()
        self.assertLessEqual(sketch.get_lower_bound(1), estimate)
        self.assertGreaterEqual(sketch.get_upper_bound(1), estimate)

        # The sketch reports the configuration it was built with.
        self.assertEqual(sketch.lg_config_k, lg_k)
        self.assertEqual(sketch.tgt_type, target_type)

        # Serialized sizes must match what the sketch predicts.
        compact_payload = sketch.serialize_compact()
        updatable_payload = sketch.serialize_updatable()
        self.assertEqual(len(compact_payload),
                         sketch.get_compact_serialization_bytes())
        self.assertEqual(len(updatable_payload),
                         sketch.get_updatable_serialization_bytes())

        self.assertFalse(sketch.is_compact())
        self.assertFalse(sketch.is_empty())

        # Both serialized forms must deserialize back into an hll_sketch.
        for payload in (compact_payload, updatable_payload):
            self.assertIsInstance(hll_sketch.deserialize(payload), hll_sketch)

        # Static helpers only need to return something.
        rel_err = hll_sketch.get_rel_err(True, False, 12, 1)
        self.assertIsNotNone(rel_err)
        max_bytes = hll_sketch.get_max_updatable_serialization_bytes(
            20, tgt_hll_type.HLL_6)
        self.assertIsNotNone(max_bytes)

        # After reset() the sketch reports itself as empty again.
        sketch.reset()
        self.assertTrue(sketch.is_empty())
예제 #2
0
    def test_hll_example(self):
        """Walk through a typical HLL workflow end to end.

        Two sketches are filled with partially overlapping integers,
        unioned, and the union result is serialized, restored and reset.
        """
        lg_k = 12                   # 2^lg_k = 4096 rows in the table
        num_values = 1 << 18        # ~256k unique values
        # Integer division keeps the offset an int (plain "/" yields a float).
        shift = 3 * num_values // 4
        # The two ranges overlap on a quarter of the values, so the true
        # distinct count of the union is 7/4 * num_values.
        true_distinct = 7 * num_values / 4

        first = hll_sketch(lg_k, tgt_hll_type.HLL_8)
        second = hll_sketch(lg_k, tgt_hll_type.HLL_6)
        # Hashing works on the bits, not on an abstract numeric value, so
        # update(1) and update(1.0) give different results.
        for value in range(num_values):
            first.update(value)
            second.update(value + shift)

        # get_composite_estimate() exists too, but get_estimate() always
        # gives the best available estimate and is the recommended call.
        # The bounds must bracket the estimate regardless of the exact value.
        estimate = first.get_estimate()
        self.assertLessEqual(first.get_lower_bound(1), estimate)
        self.assertGreaterEqual(first.get_upper_bound(1), estimate)

        # Unions live in a separate class; the union can be queried
        # directly or asked for a result sketch.
        merger = hll_union(lg_k)
        for sketch in (first, second):
            merger.update(sketch)
        merged = merger.get_result()
        self.assertEqual(merged.get_estimate(), merger.get_estimate())

        # The whole process (including the post-union HLL) is deterministic,
        # and the exact answer is known to lie within one standard deviation
        # of the estimate.
        self.assertLessEqual(merger.get_lower_bound(1), true_distinct)
        self.assertGreaterEqual(merger.get_upper_bound(1), true_distinct)

        # Serialize for storage, then reconstruct.
        payload = merged.serialize_compact()
        self.assertEqual(len(payload),
                         merged.get_compact_serialization_bytes())
        restored = hll_sketch.deserialize(payload)

        # The restored sketch self-reports its configuration and status.
        # get_result() was called without a target type above, and the
        # result is expected to be HLL_4.
        self.assertEqual(restored.lg_config_k, lg_k)
        self.assertEqual(restored.tgt_type, tgt_hll_type.HLL_4)
        self.assertFalse(restored.is_empty())

        # Resetting is a way to reduce some object overhead.
        restored.reset()
        self.assertTrue(restored.is_empty())
예제 #3
0
    def __merge_states(self, states: Sequence[State]) -> State:
        """Combine several partial states of one kind into a single state.

        The concrete type of ``states[0]`` selects the merge strategy. The
        remaining states are assumed (not checked) to be of that same type.
        Except for ``SchemaState``, the result is a new state object that
        carries the id of the first state.

        Args:
            states: Non-empty sequence of states to merge.

        Returns:
            The merged state. For ``SchemaState`` the first state is returned
            unchanged. ``None`` is returned when the type of the first state
            is not handled by any branch below.

        Raises:
            IndexError: If ``states`` is empty.
            NotImplementedError: If the states are ``FrequenciesAndNumRows``.
        """
        first_state = states[0]
        remaining = states[1:]
        result_state = None

        if isinstance(first_state, SchemaState):
            # Schemas are not combined; the first one wins.
            result_state = first_state
        elif isinstance(first_state, MaxState):
            result_state = MaxState(
                first_state.id, max(state.max_value for state in states))
        elif isinstance(first_state, MeanState):
            total = sum(state.total for state in states)
            count = sum(state.count for state in states)
            result_state = MeanState(first_state.id, total, count)
        elif isinstance(first_state, MinState):
            result_state = MinState(
                first_state.id, min(state.min_value for state in states))
        elif isinstance(first_state, NumMatches):
            num_matches = sum(state.num_matches for state in states)
            result_state = NumMatches(first_state.id, num_matches)
        elif isinstance(first_state, NumMatchesAndCount):
            num_matches = sum(state.num_matches for state in states)
            count = sum(state.count for state in states)
            result_state = NumMatchesAndCount(first_state.id, num_matches,
                                              count)
        elif isinstance(first_state, QuantileState):
            # deserialize() is a static factory, so it is called on the
            # sketch class directly; no throwaway instance is needed.
            if first_state.sketch_type == "floats":
                kll_class = kll_floats_sketch
            else:
                kll_class = kll_ints_sketch
            main_kll = kll_class.deserialize(
                bytes.fromhex(first_state.serializedKll))
            for state in remaining:
                main_kll.merge(
                    kll_class.deserialize(bytes.fromhex(state.serializedKll)))

            result_state = QuantileState(first_state.id,
                                         main_kll.serialize().hex(),
                                         first_state.quantile,
                                         first_state.sketch_type)
        elif isinstance(first_state, ApproxDistinctState):
            main_hll = hll_sketch.deserialize(
                bytes.fromhex(first_state.serializedHll))
            num_rows = first_state.num_rows
            if remaining:
                # hll_sketch.update() ingests single data items; sketches
                # have to be combined through the separate hll_union class.
                # Imported locally because only this branch needs it
                # (assumes hll_sketch in this file comes from datasketches).
                from datasketches import hll_union

                union = hll_union(main_hll.lg_config_k)
                union.update(main_hll)
                for state in remaining:
                    num_rows = num_rows + state.num_rows
                    union.update(hll_sketch.deserialize(
                        bytes.fromhex(state.serializedHll)))
                # Keep the target type of the first sketch for the result.
                main_hll = union.get_result(main_hll.tgt_type)
            approx_distinct_count = main_hll.get_estimate()
            serialized_hll = main_hll.serialize_updatable().hex()
            result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                               approx_distinct_count, num_rows)
        elif isinstance(first_state, StandardDeviationState):
            n = first_state.n
            avg = first_state.avg
            m2 = first_state.m2
            stddev = first_state.stddev
            for state in remaining:
                merged_n = n + state.n
                if merged_n == 0:
                    # Both sides are empty: nothing to combine, and the
                    # formulas below would divide by zero.
                    continue
                # Pairwise combination of (count, mean, M2). Every term must
                # use the counts from *before* the merge, so n is only
                # overwritten at the very end.
                delta = state.avg - avg
                avg = (n * avg + state.n * state.avg) / merged_n
                m2 = m2 + state.m2 + delta * delta * n * state.n / merged_n
                n = merged_n
                # NOTE(review): this is m2 / (n - 1) without a square root,
                # i.e. a sample variance, exactly as before this change.
                # Confirm against how an unmerged state computes `stddev`.
                stddev = (m2 / (n - 1)) if n > 1 else 0
            result_state = StandardDeviationState(first_state.id, n, avg, m2,
                                                  stddev)
        elif isinstance(first_state, SumState):
            result_state = SumState(
                first_state.id, sum(state.sum_value for state in states))
        elif isinstance(first_state, FrequenciesAndNumRows):
            raise NotImplementedError(
                "Merging of FrequenciesAndNumRows states not implemented, yet")

        return result_state