def calculate_sketch_statistics(data):
    columns = list(data.columns)
    types = list(data.dtypes)
    stats_dict = {}
    for column, col_type in zip(columns, types):
        if col_type in [np.int32, np.int64, np.float64]:
            data_col = data[column].to_numpy()
            # pick the sketch implementation that matches the column dtype
            if data[column].dtype in [np.int32, np.int64]:
                kll = kll_ints_sketch(2048)
            elif data[column].dtype == np.float64:
                kll = kll_floats_sketch(2048)
            kll.update(data_col)
            stat_values = kll.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
            stat_names = ["0.05", "Q1", "Median", "Q3", "0.95"]
            hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
            hll.update(data_col)  # works with local fork (np.array extension)
            approx_distinct_count = hll.get_estimate()
            stat_values.append(round(approx_distinct_count))
            stat_names.append("Distinct Count")
            stat_pairs = [list(i) for i in zip(stat_names, stat_values)]
            stats_dict[column] = stat_pairs
    return stats_dict
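# Illustrative usage sketch (not part of the original module): assumes pandas is
# importable, that the numpy/datasketches names used above are in scope, and that
# hll_sketch.update accepts numpy arrays (local fork, per the comment in
# calculate_sketch_statistics). The function name, column names and values below
# are made up for illustration.
def _example_calculate_sketch_statistics():
    import pandas as pd

    df = pd.DataFrame({
        "age": np.array([23, 35, 41, 29], dtype=np.int64),
        "score": np.array([0.7, 0.9, 0.4, 0.8], dtype=np.float64),
    })
    stats = calculate_sketch_statistics(df)
    # stats["age"] is a list of [name, value] pairs, e.g.
    # [["0.05", ...], ["Q1", ...], ["Median", ...], ["Q3", ...], ["0.95", ...],
    #  ["Distinct Count", 4]]
    return stats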
def deserialize_kll_floats_sketch(x: bytes, kind: str = "float"):
    """
    Deserialize a KLL floats sketch. Compatible with whylogs-java;
    whylogs histograms are serialized as KLL floats sketches.

    Parameters
    ----------
    x : bytes
        Serialized sketch
    kind : str, optional
        Specify type of sketch: 'float' or 'int'

    Returns
    -------
    sketch : `kll_floats_sketch`, `kll_ints_sketch`, or None
        If `x` is an empty sketch, return None, else return the deserialized sketch.
    """
    if len(x) < 1:
        return None
    if kind == "float":
        h = datasketches.kll_floats_sketch.deserialize(x)
    elif kind == "int":
        h = datasketches.kll_ints_sketch.deserialize(x)
    if h.get_n() < 1:
        return None
    return h
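# Minimal round-trip sketch for deserialize_kll_floats_sketch (illustrative, not part
# of the original file; assumes the datasketches package is imported as in this
# module): serialize a small KLL floats sketch and read it back; an empty byte
# string yields None.
def _example_kll_roundtrip():
    sk = datasketches.kll_floats_sketch(128)
    for v in [1.0, 2.0, 3.0]:
        sk.update(v)
    restored = deserialize_kll_floats_sketch(sk.serialize(), kind="float")
    assert restored is not None and restored.get_n() == 3
    assert deserialize_kll_floats_sketch(b"") is None
    return restored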
def test_kll_ints_sketch(self):
    k = 100
    n = 10
    kll = kll_ints_sketch(k)
    for i in range(0, n):
        kll.update(i)

    self.assertEqual(kll.get_min_value(), 0)
    self.assertEqual(kll.get_max_value(), n - 1)
    self.assertEqual(kll.get_n(), n)
    self.assertFalse(kll.is_empty())
    self.assertFalse(kll.is_estimation_mode())  # n < k

    pmf = kll.get_pmf([round(n / 2)])
    self.assertIsNotNone(pmf)
    self.assertEqual(len(pmf), 2)

    cdf = kll.get_cdf([round(n / 2)])
    self.assertIsNotNone(cdf)
    self.assertEqual(len(cdf), 2)

    self.assertEqual(kll.get_quantile(0.5), round(n / 2))
    quants = kll.get_quantiles([0.25, 0.5, 0.75])
    self.assertIsNotNone(quants)
    self.assertEqual(len(quants), 3)
    self.assertEqual(kll.get_rank(round(n / 2)), 0.5)

    # merge self
    kll.merge(kll)
    self.assertEqual(kll.get_n(), 2 * n)

    sk_bytes = kll.serialize()
    self.assertTrue(
        isinstance(kll_ints_sketch.deserialize(sk_bytes), kll_ints_sketch))
def calculate_sketch_statistics_np(np_arr):
    columns = np_arr.keys()
    stats_dict = {}
    for column in columns:
        col_type = np_arr[column].dtype
        if col_type in [np.int32, np.int64, np.float64]:
            data_col = np_arr[column]
            if col_type in [np.int32, np.int64]:
                kll = kll_ints_sketch(2048)
            elif col_type == np.float64:
                kll = kll_floats_sketch(2048)
            kll.update(data_col)
            quantiles = kll.get_quantiles([0.05, 0.25, 0.5, 0.75, 0.95])
            quantile_names = ["0.05", "Q1", "Median", "Q3", "0.95"]
            stat_pairs = [list(i) for i in zip(quantile_names, quantiles)]
            stats_dict[column] = stat_pairs
    return stats_dict
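# Illustrative call of calculate_sketch_statistics_np (not in the original source):
# the input is assumed to be any mapping of column name -> numpy array, e.g. a plain
# dict; the function name and values are made up.
def _example_calculate_sketch_statistics_np():
    np_arr = {
        "age": np.array([23, 35, 41, 29], dtype=np.int64),
        "score": np.array([0.7, 0.9, 0.4, 0.8], dtype=np.float64),
    }
    # each entry maps a column to [["0.05", ...], ["Q1", ...], ..., ["0.95", ...]]
    return calculate_sketch_statistics_np(np_arr)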
def metrics_from_states(
        self, properties_and_states: Dict[Property, State]) -> Dict[Property, Metric]:
    property_metric_map: Dict[Property, Metric] = {}
    for prop, state in properties_and_states.items():
        if isinstance(prop, Quantile):
            # QuantileState(property_identifier, serialized_kll, quantile, sketch_type)
            quantile_state = state
            if quantile_state.sketch_type == "floats":
                kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            else:
                kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            main_kll = kll_ser.deserialize(
                bytes.fromhex(quantile_state.serializedKll))
            quantile = main_kll.get_quantiles([prop.quantile])[0]
            quantile_metric = metric_from_value(quantile, prop.name, prop.instance,
                                                prop.entity)
            property_metric_map[prop] = quantile_metric
        elif isinstance(prop, ApproxDistinctness):
            # ApproxDistinctState(property_identifier, serialized_hll,
            #                     approx_distinct_count, num_rows)
            approx_distinct_state = state
            approx_distinctness = min(
                approx_distinct_state.approx_distinct_count /
                approx_distinct_state.num_rows, 1.00)
            approx_distinct_metric = metric_from_value(approx_distinctness,
                                                       prop.name, prop.instance,
                                                       prop.entity)
            property_metric_map[prop] = approx_distinct_metric
        elif isinstance(prop, Schema):
            # SchemaState(property_identifier, schema)
            schema_state = state
            schema = schema_state.schema
            schema_metric = metric_from_value(schema, prop.name, prop.instance,
                                              prop.entity)
            property_metric_map[prop] = schema_metric
        else:
            operator = SQLOperatorFactory.create_operator(prop)
            metric = operator.get_metric(state)
            property_metric_map[prop] = metric
    return property_metric_map
def compute_metrics(self, properties: Set[Property], repo: MetadataRepository):
    quantile_properties = [
        property for property in properties if isinstance(property, Quantile)
    ]
    quantile_metrics: Dict[Property, Metric] = {}
    for quantile_property in quantile_properties:
        data_col = self.data[quantile_property.column].to_numpy()
        sketch_type = ""
        if self.data[quantile_property.column].dtype == np.int64:
            kll = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "ints"
        elif self.data[quantile_property.column].dtype == np.float64:
            kll = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
            sketch_type = "floats"
        else:
            raise NotImplementedError(
                f"Data Type {self.data[quantile_property.column].dtype} is not supported for sketches!"
            )
        kll.update(data_col)
        quantile = kll.get_quantiles([quantile_property.quantile])[0]
        # sketches are stored as hex strings; the read path decodes them with bytes.fromhex()
        serialized_kll = kll.serialize().hex()
        quantile_state = QuantileState(quantile_property.property_identifier(),
                                       serialized_kll, quantile, sketch_type)
        repo.register_state(quantile_state)
        quantile_metric = metric_from_value(quantile, quantile_property.name,
                                            quantile_property.instance,
                                            quantile_property.entity)
        quantile_metrics[quantile_property] = quantile_metric

    approx_distinct_properties = [
        property for property in properties
        if isinstance(property, ApproxDistinctness)
    ]
    approx_distinct_metrics: Dict[Property, Metric] = {}
    for approx_distinct_property in approx_distinct_properties:
        data_col = self.data[approx_distinct_property.column].to_numpy()
        hll = hll_sketch(DEFAULT_HLL_K, DEFAULT_HLL_TYPE)
        # for v in data_col:  # slow
        #     hll.update(v)
        hll.update(data_col)  # works with local fork (np.array extension)
        approx_distinct_count = hll.get_estimate()
        num_rows = len(data_col)
        serialized_hll = hll.serialize_updatable().hex()  # decode with bytes.fromhex()
        approx_distinct_state = ApproxDistinctState(
            approx_distinct_property.property_identifier(), serialized_hll,
            approx_distinct_count, num_rows)
        repo.register_state(approx_distinct_state)
        approx_distinctness = min(approx_distinct_count / num_rows, 1.00)
        approx_distinct_metric = metric_from_value(
            approx_distinctness, approx_distinct_property.name,
            approx_distinct_property.instance, approx_distinct_property.entity)
        approx_distinct_metrics[approx_distinct_property] = approx_distinct_metric

    other_properties = [
        property for property in properties
        if (not isinstance(property, Quantile)
            and not isinstance(property, ApproxDistinctness))
    ]
    metrics = self.engine.compute_metrics(other_properties, repo)
    metrics.update(quantile_metrics)
    metrics.update(approx_distinct_metrics)
    return metrics
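# Hedged sketch of the hex serialization round trip used by compute_metrics and
# metrics_from_states (illustrative only; assumes the datasketches imports and the
# project's DEFAULT_SKETCH_SIZE are in scope): states store sketches as hex strings,
# and the read path rebuilds them with bytes.fromhex().
def _example_hex_roundtrip():
    kll = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
    for v in [0.1, 0.5, 0.9]:
        kll.update(v)
    serialized = kll.serialize().hex()  # write path: bytes -> hex string
    restored = kll_floats_sketch.deserialize(bytes.fromhex(serialized))  # read path
    return restored.get_quantiles([0.5])[0]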
def __merge_states(self, states: Sequence[State]) -> State:
    first_state = states[0]
    result_state = None
    if isinstance(first_state, SchemaState):
        result_state = first_state
    elif isinstance(first_state, MaxState):
        max_value: float = first_state.max_value
        for state in states:
            max_value = max(max_value, state.max_value)
        result_state = MaxState(first_state.id, max_value)
    elif isinstance(first_state, MeanState):
        total: float = 0
        count: int = 0
        for state in states:
            total = total + state.total
            count = count + state.count
        result_state = MeanState(first_state.id, total, count)
    elif isinstance(first_state, MinState):
        min_value: float = first_state.min_value
        for state in states:
            min_value = min(min_value, state.min_value)
        result_state = MinState(first_state.id, min_value)
    elif isinstance(first_state, NumMatches):
        num_matches: int = 0
        for state in states:
            num_matches = num_matches + state.num_matches
        result_state = NumMatches(first_state.id, num_matches)
    elif isinstance(first_state, NumMatchesAndCount):
        num_matches: int = 0
        count: int = 0
        for state in states:
            num_matches = num_matches + state.num_matches
            count = count + state.count
        result_state = NumMatchesAndCount(first_state.id, num_matches, count)
    elif isinstance(first_state, QuantileState):
        if first_state.sketch_type == "floats":
            kll_ser = kll_floats_sketch(DEFAULT_SKETCH_SIZE)
        else:
            kll_ser = kll_ints_sketch(DEFAULT_SKETCH_SIZE)
        main_kll = kll_ser.deserialize(bytes.fromhex(first_state.serializedKll))
        for state in states[1:]:
            new_kll = kll_ser.deserialize(bytes.fromhex(state.serializedKll))
            main_kll.merge(new_kll)
        result_state = QuantileState(first_state.id, main_kll.serialize().hex(),
                                     first_state.quantile, first_state.sketch_type)
    elif isinstance(first_state, ApproxDistinctState):
        main_hll = hll_sketch.deserialize(bytes.fromhex(first_state.serializedHll))
        num_rows = first_state.num_rows
        for state in states[1:]:
            num_rows = num_rows + state.num_rows
            new_hll = hll_sketch.deserialize(bytes.fromhex(state.serializedHll))
            # HLL sketches are combined through an hll_union rather than update();
            # assumes hll_union is imported from datasketches and that DEFAULT_HLL_K
            # matches the lg_k the sketches were created with
            union = hll_union(DEFAULT_HLL_K)
            union.update(main_hll)
            union.update(new_hll)
            main_hll = union.get_result(DEFAULT_HLL_TYPE)
        approx_distinct_count = main_hll.get_estimate()
        serialized_hll = main_hll.serialize_updatable().hex()
        result_state = ApproxDistinctState(first_state.id, serialized_hll,
                                           approx_distinct_count, num_rows)
    elif isinstance(first_state, StandardDeviationState):
        n: float = first_state.n
        avg: float = first_state.avg
        m2: float = first_state.m2
        stddev: float = first_state.stddev
        for state in states[1:]:
            # parallel merge of mean and M2 (Chan et al.)
            combined_n = n + state.n
            delta = state.avg - avg
            avg = (n * avg + state.n * state.avg) / combined_n
            m2 = m2 + state.m2 + delta * delta * n * state.n / combined_n
            n = combined_n
        stddev = (m2 / (n - 1)) if n > 1 else 0
        result_state = StandardDeviationState(first_state.id, n, avg, m2, stddev)
    elif isinstance(first_state, SumState):
        sum_value: float = 0
        for state in states:
            sum_value = sum_value + state.sum_value
        result_state = SumState(first_state.id, sum_value)
    elif isinstance(first_state, FrequenciesAndNumRows):
        raise NotImplementedError(
            "Merging of FrequenciesAndNumRows states not implemented, yet")
        # frequencies_table: str
        # grouping_columns: List[str]
        # num_rows: int
        # def get_table_name(self) -> str:
        #     return self.frequencies_table
    return result_state
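# Hedged illustration of the merge semantics __merge_states relies on (not part of
# the original file; assumes kll_ints_sketch, hll_sketch and hll_union are imported
# from datasketches, and lg_k=12 is an illustrative stand-in for DEFAULT_HLL_K):
# KLL sketches merge in place via merge(), while HLL sketches are combined through
# an hll_union.
def _example_sketch_merges():
    a, b = kll_ints_sketch(128), kll_ints_sketch(128)
    for i in range(100):
        a.update(i)
        b.update(100 + i)
    a.merge(b)  # a now summarizes both inputs (n == 200)

    h1, h2 = hll_sketch(12), hll_sketch(12)
    for i in range(100):
        h1.update(i)
        h2.update(100 + i)
    union = hll_union(12)
    union.update(h1)
    union.update(h2)
    merged = union.get_result()  # estimate should be close to 200
    return a.get_n(), merged.get_estimate()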