def __init__(
    self,
    variance: VarianceTracker = None,
    floats: FloatTracker = None,
    ints: IntTracker = None,
    theta_sketch: ThetaSketch = None,
    histogram: datasketches.kll_floats_sketch = None,
    frequent_numbers: dsketch.FrequentNumbersSketch = None,
):
    """Store the given trackers, creating a default for each one left out.

    Every argument is optional.  An argument that is ``None`` is replaced
    by a newly constructed object of the matching kind; the histogram
    default is a ``kll_floats_sketch`` built with ``DEFAULT_HIST_K``.
    """
    # Resolve all defaults first, then publish them on the instance, so no
    # attribute is set unless every default could be constructed.
    variance = VarianceTracker() if variance is None else variance
    floats = FloatTracker() if floats is None else floats
    ints = IntTracker() if ints is None else ints
    theta_sketch = ThetaSketch() if theta_sketch is None else theta_sketch
    histogram = (
        datasketches.kll_floats_sketch(DEFAULT_HIST_K)
        if histogram is None
        else histogram
    )
    frequent_numbers = (
        dsketch.FrequentNumbersSketch()
        if frequent_numbers is None
        else frequent_numbers
    )

    self.variance = variance
    self.floats = floats
    self.ints = ints
    self.theta_sketch = theta_sketch
    self.histogram = histogram
    self.frequent_numbers = frequent_numbers
def calculate_sketch_statistics(data):
    """Compute approximate per-column statistics with sketches.

    Only columns whose dtype is int32, int64 or float64 are processed;
    every other column is skipped.  For each processed column a KLL sketch
    provides the quantiles at 0.05, 0.25, 0.5, 0.75 and 0.95, and an HLL
    sketch provides a rounded estimate of the number of distinct values.

    Returns:
        A dict mapping column name to a list of ``[label, value]`` pairs.
    """
    summary = {}
    for name, listed_dtype in zip(list(data.columns), list(data.dtypes)):
        if listed_dtype not in (np.int32, np.int64, np.float64):
            continue

        values = data[name].to_numpy()
        # Integer columns get the integer sketch, float64 the float sketch.
        if data[name].dtype in (np.int32, np.int64):
            quantile_sketch = kll_ints_sketch(2048)
        elif data[name].dtype == np.float64:
            quantile_sketch = kll_floats_sketch(2048)
        quantile_sketch.update(values)
        stats = quantile_sketch.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])

        distinct_sketch = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        # Per the original note, feeding a whole array to the HLL sketch
        # works with a local fork (np.array extension).
        distinct_sketch.update(values)
        stats.append(round(distinct_sketch.get_estimate()))

        labels = ["0.05", "Q1", "Median", "Q3", "0.95", "Distinct Count"]
        summary[name] = [[label, value] for label, value in zip(labels, stats)]
    return summary
def test_histogram_summary():
    """Nine distinct values must produce a summary with several counts."""
    values = [float(v) for v in range(1, 10)]
    sketch = datasketches.kll_floats_sketch(256)
    for value in values:
        sketch.update(value)

    summary = summaryconverters.histogram_from_sketch(sketch)

    _hist_summary_check(summary, values)
    assert len(summary.counts) > 1
def test_single_value_histogram_summary():
    """Thirty copies of one value must produce a summary with one count."""
    values = [1.0] * 30
    sketch = datasketches.kll_floats_sketch(256)
    for value in values:
        sketch.update(value)

    summary = summaryconverters.histogram_from_sketch(sketch)

    _hist_summary_check(summary, values)
    assert len(summary.counts) == 1
def test_kll_example(self):
    """Walk through the kll_floats_sketch API on ~1 million N(0,1) draws."""
    from numpy.random import randn

    k = 160
    n = 2**20

    # Build a sketch and stream the random points into it one at a time.
    sketch = kll_floats_sketch(k)
    for _ in range(n):
        sketch.update(randn())

    # For a standard normal, 0 sits near rank 0.5 and the median near 0.
    self.assertAlmostEqual(0.5, sketch.get_rank(0.0), delta=0.02)
    self.assertAlmostEqual(0.0, sketch.get_quantile(0.5), delta=0.02)

    # Min and max are tracked separately from the retained items, so the
    # full observed range is always available.
    lowest = sketch.get_min_value()
    self.assertLessEqual(lowest, sketch.get_quantile(0.01))
    self.assertLessEqual(0.0, sketch.get_rank(lowest))
    highest = sketch.get_max_value()
    self.assertGreaterEqual(highest, sketch.get_quantile(0.99))
    self.assertGreaterEqual(1.0, sketch.get_rank(highest))

    # Several quantiles can be fetched in one call; these fractions should
    # land close to [-2, -1, 0, 1, 2].  Feeding the points back into
    # get_cdf() yields one extra entry, the trailing 1.0 that accounts for
    # all probability mass.
    points = sketch.get_quantiles([0.0228, 0.1587, 0.5, 0.8413, 0.9772])
    cdf = sketch.get_cdf(points)
    self.assertEqual(len(cdf), len(points) + 1)

    # The instance's normalized rank error must match the static bound.
    rank_error = sketch.normalized_rank_error(False)
    self.assertEqual(
        rank_error, kll_floats_sketch.get_normalized_rank_error(k, False)
    )

    # A few basic queries about the sketch.
    self.assertFalse(sketch.is_empty())
    self.assertTrue(sketch.is_estimation_mode())
    self.assertEqual(sketch.get_n(), n)
    self.assertLess(sketch.get_num_retained(), n)

    # Merging the sketch with itself doubles the number of items seen.
    sketch.merge(sketch)
    self.assertEqual(sketch.get_n(), 2 * n)

    # A serialize/deserialize round trip must preserve the sketch.
    payload = sketch.serialize()
    restored = sketch.deserialize(payload)
    self.assertEqual(sketch.get_num_retained(), restored.get_num_retained())
    self.assertEqual(sketch.get_min_value(), restored.get_min_value())
    self.assertEqual(sketch.get_max_value(), restored.get_max_value())
    self.assertEqual(sketch.get_quantile(0.7), restored.get_quantile(0.7))
    self.assertEqual(sketch.get_rank(0.0), restored.get_rank(0.0))
def run_step(self, run_number: int, step_size: int, howlong: float) -> ReturnRunStep:
    """Fold the next batch of created rows into the running KLL sketch.

    Takes up to ``step_size`` newly created row indices from the input
    table, sketches the configured column for those rows, merges that
    sketch into ``self._kll`` and publishes ``max``, ``min``,
    ``quantiles``, ``splits`` and ``pmf`` in the result dict.

    ``self.params.binning`` may be an integer (number of evenly spaced
    split points between the observed min and max), a sequence (explicit
    split points) or a dict with ``lower``, ``upper`` and ``n_splits``.
    """
    assert self.context
    with self.context as ctx:
        indices = ctx.table.created.next(step_size)  # returns a slice
        steps = indices_len(indices)
        if not steps:
            return self._return_run_step(self.state_blocked, steps_run=0)

        batch = ctx.table.data()[self.column].loc[fix_loc(indices)]

        # Sketch only the new rows, then merge into the long-lived sketch.
        kll = self._kll
        batch_sketch = kll_floats_sketch(self._k)
        batch_sketch.update(batch)
        assert kll
        kll.merge(batch_sketch)

        max_ = kll.get_max_value()
        min_ = kll.get_min_value()
        quantiles: Floats = []
        splits: Floats = []
        pmf: Floats = []

        requested_quantiles = self.params.quantiles
        if requested_quantiles:
            quantiles = kll.get_quantiles(requested_quantiles)

        binning = self.params.binning
        if binning:
            if isinstance(binning, integer_types):
                splits = np.linspace(min_, max_, binning)
                # The last linspace point is not passed as a split point.
                pmf = kll.get_pmf(splits[:-1])
            elif isinstance(binning, Sequence):
                splits = binning
                pmf = kll.get_pmf(splits)
            elif isinstance(binning, dict):
                splits = np.linspace(
                    binning["lower"], binning["upper"], binning["n_splits"]
                )
                pmf = kll.get_pmf(splits[:-1])

        res = {
            "max": max_,
            "min": min_,
            "quantiles": quantiles,
            "splits": splits,
            "pmf": pmf,
        }
        if self.result is None:
            self.result = PsDict(res)
        else:
            self.psdict.update(res)
        return self._return_run_step(self.next_state(ctx.table), steps)
def test_kll2(self):
    """Module quantiles are compared with a sketch built on the column."""
    np.random.seed(42)
    sched = self.scheduler()
    source = RandomTable(3, rows=10_000, scheduler=sched)
    module = KLLSketch(column="_1", scheduler=sched)
    module.params.quantiles = QUANTILES
    module.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # Reference: one sketch fed the whole column at once.
    reference = kll_floats_sketch(K)
    reference.update(source.result["_1"].value)
    self.compare(module.result["quantiles"], reference.get_quantiles(QUANTILES))
def test_kll4(self):
    """With explicit split points, the module PMF is compared with a direct sketch."""
    np.random.seed(42)
    sched = self.scheduler()
    source = RandomTable(3, rows=10_000, scheduler=sched)
    module = KLLSketch(column="_1", scheduler=sched)
    module.params.binning = SPLITS_SEQ
    module.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # Reference: one sketch fed the whole column at once.
    reference = kll_floats_sketch(K)
    reference.update(source.result["_1"].value)
    expected_pmf = reference.get_pmf(SPLITS_SEQ)
    self.compare(module.result["pmf"], expected_pmf)
def test_kll(self):
    """Without parameters only min/max are filled; the rest stay empty."""
    np.random.seed(42)
    sched = self.scheduler()
    source = RandomTable(3, rows=10_000, scheduler=sched)
    module = KLLSketch(column="_1", scheduler=sched)
    module.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # Reference: one sketch fed the whole column at once.
    reference = kll_floats_sketch(K)
    reference.update(source.result["_1"].value)

    outcome = module.result
    self.assertAlmostEqual(outcome["max"], reference.get_max_value())
    self.assertAlmostEqual(outcome["min"], reference.get_min_value())
    self.assertEqual(outcome["quantiles"], [])
    self.assertEqual(outcome["splits"], [])
    self.assertEqual(outcome["pmf"], [])
def calculate_sketch_statistics_np(np_arr):
    """Compute approximate quantiles for each numeric entry of a mapping.

    ``np_arr`` is accessed through ``keys()`` and item lookup; each value
    must expose ``dtype``.  Entries whose dtype is int32, int64 or float64
    are summarized with a KLL sketch; all other entries are skipped.

    Returns:
        A dict mapping key to a list of ``[label, quantile]`` pairs for the
        quantiles 0.05, 0.25, 0.5, 0.75 and 0.95.
    """
    summary = {}
    for name in np_arr.keys():
        values = np_arr[name]
        dtype = values.dtype
        if dtype not in (np.int32, np.int64, np.float64):
            continue

        # Integer data gets the integer sketch, float64 the float sketch.
        if dtype in (np.int32, np.int64):
            sketch = kll_ints_sketch(2048)
        elif dtype == np.float64:
            sketch = kll_floats_sketch(2048)
        sketch.update(values)

        quantiles = sketch.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
        labels = ["0.05", "Q1", "Median", "Q3", "0.95"]
        summary[name] = [[label, value] for label, value in zip(labels, quantiles)]
    return summary
def test_kll5(self):
    """With dict-style binning, the module PMF is compared with a direct sketch."""
    np.random.seed(42)
    sched = self.scheduler()
    source = RandomTable(3, rows=10_000, scheduler=sched)
    module = KLLSketch(column="_1", scheduler=sched)
    module.params.binning = SPLITS_DICT
    module.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # Reference: one sketch fed the whole column at once.
    reference = kll_floats_sketch(K)
    reference.update(source.result["_1"].value)

    # Rebuild the split points from the same lower/upper/n_splits settings.
    edges = np.linspace(
        SPLITS_DICT["lower"], SPLITS_DICT["upper"], SPLITS_DICT["n_splits"]
    )
    expected_pmf = reference.get_pmf(edges[:-1])
    self.compare(module.result["pmf"], expected_pmf)
def test_kll3(self):
    """With an integer bin count, the module PMF is compared with a direct sketch."""
    np.random.seed(42)
    sched = self.scheduler()
    source = RandomTable(3, rows=10_000, scheduler=sched)
    module = KLLSketch(column="_1", scheduler=sched)
    module.params.binning = BINS
    module.input[0] = source.output.result
    printer = Print(proc=self.terse, scheduler=sched)
    printer.input[0] = module.output.result
    aio.run(sched.start())

    # Reference: one sketch fed the whole column at once.
    reference = kll_floats_sketch(K)
    reference.update(source.result["_1"].value)

    # Evenly spaced points across the reference sketch's observed range.
    highest = reference.get_max_value()
    lowest = reference.get_min_value()
    edges = np.linspace(lowest, highest, BINS)
    expected_pmf = reference.get_pmf(edges[:-1])
    self.compare(module.result["pmf"], expected_pmf)
def metrics_from_states(
        self, properties_and_states: Dict[Property, State]) -> Dict[Property, Metric]:
    """Turn each (property, state) pair into a metric.

    Quantile, ApproxDistinctness and Schema properties are evaluated here
    from their state; every other property is delegated to the operator
    returned by ``SQLOperatorFactory.create_operator``.

    Returns:
        A dict mapping each property to its metric.
    """
    metrics: Dict[Property, Metric] = {}
    for prop, state in properties_and_states.items():
        if isinstance(prop, Quantile):
            # The state stores the sketch hex-encoded; the matching sketch
            # type is needed to deserialize it.
            if state.sketch_type == "floats":
                loader = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            else:
                loader = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            sketch = loader.deserialize(bytes.fromhex(state.serializedKll))
            value = sketch.get_quantiles([prop.quantile])[0]
        elif isinstance(prop, ApproxDistinctness):
            # Ratio of estimated distinct values to rows, capped at 1.
            value = min(state.approx_distinct_count / state.num_rows, 1.00)
        elif isinstance(prop, Schema):
            value = state.schema
        else:
            metrics[prop] = SQLOperatorFactory.create_operator(prop).get_metric(state)
            continue
        metrics[prop] = metric_from_value(value, prop.name, prop.instance, prop.entity)
    return metrics
def __init__(self, column: str, k: int = 200, **kwds: Any) -> None:
    """Create the module with an empty KLL sketch for ``column``.

    Args:
        column: name of the input column to sketch.
        k: parameter passed to ``kll_floats_sketch``; also stored so new
            sketches can be built with the same value.
        **kwds: forwarded unchanged to the parent constructor.
    """
    super().__init__(**kwds)
    self.column: str = column
    self._k: int = k
    self._kll: kll_floats_sketch = kll_floats_sketch(k)
    self.default_step_size: int = 10000
def test_kll_floats_sketch(self):
    """A freshly built float sketch must instantiate and report empty."""
    # The int variant is already tested and the class is templatized, so
    # this only checks that the float variant instantiates properly.
    k = 75
    sketch = kll_floats_sketch(k)
    self.assertTrue(sketch.is_empty())
def compute_metrics(self, properties: Set[Property], repo: MetadataRepository):
    """Compute metrics for ``properties``, handling sketch-based ones here.

    Quantile properties are evaluated with a KLL sketch and
    ApproxDistinctness properties with an HLL sketch over the column in
    ``self.data``; each registers its serialized state with ``repo``.
    All remaining properties are passed to ``self.engine.compute_metrics``.

    Returns:
        The engine's metrics dict, updated with the quantile and
        approximate-distinctness metrics.

    Raises:
        NotImplementedError: if a quantile column is neither int64 nor
            float64.
    """
    quantile_metrics: Dict[Property, Metric] = {}
    for prop in [p for p in properties if isinstance(p, Quantile)]:
        series = self.data[prop.column]
        values = series.to_numpy()
        dtype = series.dtype
        if dtype == np.int64:
            sketch = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "ints"
        elif dtype == np.float64:
            sketch = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "floats"
        else:
            raise NotImplementedError(
                f"Data Type {dtype} is not supported for sketches!"
            )
        sketch.update(values)
        quantile = sketch.get_quantiles([prop.quantile])[0]
        # States carry the sketch hex-encoded (decode with bytes.fromhex).
        repo.register_state(
            QuantileState(
                prop.property_identifier(),
                sketch.serialize().hex(),
                quantile,
                sketch_type,
            )
        )
        quantile_metrics[prop] = metric_from_value(
            quantile, prop.name, prop.instance, prop.entity
        )

    approx_distinct_metrics: Dict[Property, Metric] = {}
    for prop in [p for p in properties if isinstance(p, ApproxDistinctness)]:
        values = self.data[prop.column].to_numpy()
        distinct_sketch = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        # Per the original note, a whole-array update works with a local
        # fork (np.array extension); a per-value loop was noted as slow.
        distinct_sketch.update(values)
        distinct_estimate = distinct_sketch.get_estimate()
        row_count = len(values)
        repo.register_state(
            ApproxDistinctState(
                prop.property_identifier(),
                distinct_sketch.serialize_updatable().hex(),
                distinct_estimate,
                row_count,
            )
        )
        # Ratio of estimated distinct values to rows, capped at 1.
        distinctness = min(distinct_estimate / row_count, 1.00)
        approx_distinct_metrics[prop] = metric_from_value(
            distinctness, prop.name, prop.instance, prop.entity
        )

    remaining = [
        p
        for p in properties
        if not (isinstance(p, Quantile) or isinstance(p, ApproxDistinctness))
    ]
    metrics = self.engine.compute_metrics(remaining, repo)
    metrics.update(quantile_metrics)
    metrics.update(approx_distinct_metrics)
    return metrics
def __merge_states(self, states: Sequence[State]) -> State:
    """Merge several partial states of the same kind into one state.

    The merge strategy is chosen from the type of ``states[0]``; all
    entries are expected to be of that same type.  The result carries the
    id of the first state.

    Args:
        states: non-empty sequence of states to merge.

    Returns:
        The merged state, or ``None`` when the first state is of none of
        the types handled here.

    Raises:
        IndexError: if ``states`` is empty.
        NotImplementedError: for ``FrequenciesAndNumRows`` states.
    """
    first_state = states[0]
    result_state = None
    if isinstance(first_state, SchemaState):
        result_state = first_state
    elif isinstance(first_state, MaxState):
        result_state = MaxState(
            first_state.id, max(state.max_value for state in states))
    elif isinstance(first_state, MeanState):
        result_state = MeanState(
            first_state.id,
            sum(state.total for state in states),
            sum(state.count for state in states))
    elif isinstance(first_state, MinState):
        result_state = MinState(
            first_state.id, min(state.min_value for state in states))
    elif isinstance(first_state, NumMatches):
        result_state = NumMatches(
            first_state.id, sum(state.num_matches for state in states))
    elif isinstance(first_state, NumMatchesAndCount):
        result_state = NumMatchesAndCount(
            first_state.id,
            sum(state.num_matches for state in states),
            sum(state.count for state in states))
    elif isinstance(first_state, QuantileState):
        # Sketches are stored hex-encoded; the first state's sketch type
        # selects the class used to deserialize every state.
        if first_state.sketch_type == "floats":
            kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
        else:
            kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
        main_kll = kll_ser.deserialize(
            bytes.fromhex(first_state.serializedKll))
        for state in states[1:]:
            main_kll.merge(
                kll_ser.deserialize(bytes.fromhex(state.serializedKll)))
        result_state = QuantileState(first_state.id,
                                     main_kll.serialize().hex(),
                                     first_state.quantile,
                                     first_state.sketch_type)
    elif isinstance(first_state, ApproxDistinctState):
        main_hll = hll_sketch.deserialize(
            bytes.fromhex(first_state.serializedHll))
        num_rows = first_state.num_rows
        for state in states[1:]:
            num_rows = num_rows + state.num_rows
            new_hll = hll_sketch.deserialize(
                bytes.fromhex(state.serializedHll))
            # NOTE(review): update() is given another sketch here, kept
            # as in the original.  Upstream datasketches merges HLL
            # sketches through hll_union; confirm the build in use accepts
            # a sketch argument.
            main_hll.update(new_hll)
        approx_distinct_count = main_hll.get_estimate()
        serialized_hll = main_hll.serialize_updatable().hex()
        result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                           approx_distinct_count, num_rows)
    elif isinstance(first_state, StandardDeviationState):
        n: float = first_state.n
        avg: float = first_state.avg
        m2: float = first_state.m2
        stddev: float = first_state.stddev
        for state in states[1:]:
            # Pairwise combination of (n, mean, M2).  The accumulated n and
            # avg must stay untouched until delta and the cross term are
            # computed; the previous code overwrote n first, which skewed
            # both the merged mean and m2.
            merged_n = n + state.n
            delta = state.avg - avg
            delta_n = delta / merged_n if merged_n else 0.0
            avg = avg + delta_n * state.n
            m2 = m2 + state.m2 + delta * delta_n * n * state.n
            n = merged_n
            # NOTE(review): m2 / (n - 1) has no square root, so this is a
            # sample variance; formula kept from the original.  Confirm
            # what StandardDeviationState.stddev should hold.
            stddev = (m2 / (n - 1)) if n > 1 else 0
        result_state = StandardDeviationState(first_state.id, n, avg, m2,
                                              stddev)
    elif isinstance(first_state, SumState):
        result_state = SumState(
            first_state.id, sum(state.sum_value for state in states))
    elif isinstance(first_state, FrequenciesAndNumRows):
        raise NotImplementedError(
            "Merging of FrequenciesAndNumRows states not implemented, yet")
    return result_state
def reset(self) -> None:
    """Empty the published result, if any, and start a new sketch."""
    has_result = self.result is not None
    if has_result:
        self.psdict.clear()
    # Replace the accumulated sketch with an empty one built from the
    # stored k.
    fresh_sketch = kll_floats_sketch(self._k)
    self._kll = fresh_sketch