def test_histogram_others(self, meta_conf):
    """Check the string histogram shape when there are 100 distinct values.

    The column holds "str-1" .. "str-100", where "str-i" occurs i times
    (5050 values in total).  The test asserts that the histogram has 50
    entries: the most frequent values in descending order of count, with a
    trailing "_others" bucket whose count is the sum of the counts of
    "str-1" .. "str-51".
    """
    values = []
    for i in range(1, 101):
        # Named `label` rather than `str` so the builtin is not shadowed.
        label = "str-{}".format(i)
        values.extend([label] * i)

    df = pd.DataFrame({"string_column": values})
    stats, histograms = PandasHistograms(df, meta_conf).get_histograms_and_stats()

    histogram = histograms["string_column"]
    assert len(histogram[0]) == 50 and len(histogram[1]) == 50
    assert histogram[0][0] == 100 and histogram[1][0] == "str-100"
    assert histogram[0][10] == 90 and histogram[1][10] == "str-90"
    assert histogram[0][-2] == 52 and histogram[1][-2] == "str-52"
    # The last bucket aggregates every value that did not get its own entry.
    assert histogram[0][-1] == sum(range(1, 52)) and histogram[1][-1] == "_others"

    stats = stats["string_column"]
    assert stats["count"] == 5050 == sum(histogram[0])
    assert stats["non-null"] == 5050
    assert stats["null-count"] == 0
    assert stats["distinct"] == 100
def get_value_meta(self, value: SqlOperation, meta_conf):
    """Build the ``ValueMeta`` for a SQL operation.

    Schema, dimensions and column stats are each filled in only when the
    matching ``meta_conf`` flag (``log_schema``, ``log_size``, ``log_stats``)
    is set.  No preview is produced, and histograms are always empty.
    """
    schema = {}
    if meta_conf.log_schema:
        schema = {"type": self.type_str, "dtypes": value.dtypes}

    dimensions = None
    if meta_conf.log_size:
        dimensions = [value.records_count, value.columns_count]
        schema["shape"] = dimensions
        # todo: size?

    # Histograms are not supported here; column stats are computed only
    # when the operation carries a dataframe.
    stats = []
    if meta_conf.log_stats and value.dataframe is not None:
        calculator = PandasHistograms(value.dataframe, meta_conf)
        stats, _ = calculator.get_histograms_and_stats()

    signature_hash = str(hash(self.to_signature(value)))
    return ValueMeta(
        value_preview=None,
        data_dimensions=dimensions,
        query=value.query,
        data_schema=schema,
        data_hash=signature_hash,
        columns_stats=stats,
        histogram_system_metrics=None,
        histograms={},
    )
def test_numeric_histogram(self, meta_conf):
    """Check the descriptive stats computed for a small integer column."""
    frame = pd.DataFrame({"numeric_column": [1, 3, 3, 1, 5, 1, 5, 5]})
    all_stats, _ = PandasHistograms(frame, meta_conf).get_histograms_and_stats()

    column_stats = all_stats["numeric_column"]
    # Eight values, none null, drawn from {1, 3, 5}.
    assert column_stats["count"] == 8
    assert column_stats["non-null"] == 8
    assert column_stats["distinct"] == 3
    assert column_stats["min"] == 1
    assert column_stats["max"] == 5
def get_value_meta(self, value, meta_conf):
    # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
    """Build the ``ValueMeta`` for a pandas DataFrame.

    Each part (schema, size, preview + hash, histograms + stats) is produced
    only when the corresponding ``meta_conf`` flag is enabled.  A failure to
    hash the frame is logged as a warning and leaves ``data_hash`` as None.
    """
    schema = {}
    if meta_conf.log_schema:
        schema["type"] = self.type_str
        schema["columns"] = list(value.columns)
        schema["shape"] = value.shape
        schema["dtypes"] = {
            name: str(dtype) for name, dtype in value.dtypes.items()
        }
    if meta_conf.log_size:
        schema["size.bytes"] = int(value.size)

    preview = None
    frame_hash = None
    if meta_conf.log_preview:
        preview = self.to_preview(value, preview_size=meta_conf.get_preview_size())
        # NOTE(review): hashing is assumed to sit under log_preview, as the
        # flattened original suggests - confirm against the real file.
        try:
            row_hashes = hash_pandas_object(value, index=True).values
            frame_hash = fast_hasher.hash(row_hashes)
        except Exception as e:
            # Best effort: a frame that cannot be hashed must not fail the call.
            logger.warning(
                "Could not hash dataframe object %s! Exception: %s", value, e
            )

    stats, histograms, hist_sys_metrics = {}, {}, None
    if meta_conf.log_histograms:
        started_at = time.time()
        stats, histograms = PandasHistograms(
            value, meta_conf
        ).get_histograms_and_stats()
        # Record how long the stats/histogram calculation took.
        hist_sys_metrics = {
            "histograms_and_stats_calc_time": time.time() - started_at
        }

    return ValueMeta(
        value_preview=preview,
        data_dimensions=value.shape,
        data_schema=schema,
        data_hash=frame_hash,
        descriptive_stats=stats,
        histogram_system_metrics=hist_sys_metrics,
        histograms=histograms,
    )
def test_strings_histogram(self, meta_conf):
    """Check histogram and stats for a string column that contains nulls.

    "Ola Mundo!" is deliberately split into two runs of 15 so the test also
    covers aggregation of non-adjacent equal values.
    """
    column = (
        ["Hello World!"] * 10
        + [None] * 10
        + ["Ola Mundo!"] * 15
        + ["Shalom Olam!"] * 20
        + ["Ola Mundo!"] * 15
    )
    frame = pd.DataFrame({"string_column": column})
    all_stats, all_histograms = PandasHistograms(
        frame, meta_conf
    ).get_histograms_and_stats()

    string_histogram = all_histograms["string_column"]
    # Counts are expected in descending order, with matching labels.
    assert string_histogram[0] == [30, 20, 10]
    assert string_histogram[1] == ["Ola Mundo!", "Shalom Olam!", "Hello World!"]

    string_stats = all_stats["string_column"]
    assert string_stats["count"] == 60
    assert string_stats["non-null"] == 60
    assert string_stats["null-count"] == 10
    assert string_stats["distinct"] == 4
def test_boolean_histogram(self, meta_conf):
    """Check histogram and stats for a boolean column that contains nulls."""
    column = [True] * 10 + [None] * 10 + [False] * 20 + [True] * 20
    frame = pd.DataFrame({"boolean_column": column})
    all_stats, all_histograms = PandasHistograms(
        frame, meta_conf
    ).get_histograms_and_stats()

    bool_histogram = all_histograms["boolean_column"]
    # True occurs 30 times in total (10 + 20), False 20 times.
    assert bool_histogram[0] == [30, 20]
    assert bool_histogram[1] == [True, False]

    bool_stats = all_stats["boolean_column"]
    assert bool_stats["count"] == 50
    assert bool_stats["non-null"] == 50
    assert bool_stats["null-count"] == 10
    assert bool_stats["distinct"] == 3
def get_value_meta(self, value: RedshiftOperation, meta_conf: ValueMetaConf):
    """Build the ``ValueMeta`` for a Redshift operation.

    Dimensions, schema, column stats and preview are taken from ``value``
    only when the matching ``meta_conf`` flag is enabled.  Column stats are
    computed from ``value.dataframe`` when one is present, otherwise the
    operation's own ``column_stats`` are used.  Histograms are always empty.
    """
    dimensions = value.schema["shape"] if meta_conf.log_size else None
    data_schema = value.schema if meta_conf.log_schema else None

    if not meta_conf.log_stats:
        column_stats = {}
    elif value.dataframe is not None:
        calculator = PandasHistograms(value.dataframe, meta_conf)
        column_stats, _ = calculator.get_histograms_and_stats()
    else:
        column_stats = value.column_stats

    preview = value.preview if meta_conf.log_preview else ""

    # currently, histograms are not supported
    return ValueMeta(
        value_preview=preview,
        data_dimensions=dimensions,
        data_schema=data_schema,
        data_hash=str(hash(self.to_signature(value))),
        columns_stats=column_stats,
        histogram_system_metrics=None,
        histograms={},
        query=value.query,
    )
def test_pandas_v0_histograms():
    """Tests pandas histograms calculation is stable across Pandas v1 & v0."""
    conf = ValueMetaConf.enabled()
    stats, histograms = PandasHistograms(diverse_df, conf).get_histograms_and_stats()

    # fmt: off
    expected_stats = {
        "bool_column": {
            "count": 100,
            "distinct": 3,
            "freq": 33,
            "non-null": 65,
            "null-count": 35,
            "top": False,
            "type": "bool",
            "unique": 2,
        },
        "float_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.7127659574,
            "min": 0.0,
            "non-null": 94,
            "null-count": 6,
            "std": 2.8572576537,
            "type": "float64",
        },
        "int_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.8804347826,
            "min": 0.0,
            "non-null": 92,
            "null-count": 8,
            "std": 2.7449950111,
            "type": "float64",
        },
        "str_column": {
            "count": 100,
            "distinct": 5,
            "freq": 22,
            "non-null": 79,
            "null-count": 21,
            "top": "foo",
            "type": "str",
            "unique": 4,
        },
    }
    assert stats == expected_stats

    # Both numeric columns span 0.0 .. 9.0, so they share the same 21 bin edges.
    expected_bin_edges = [
        0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95, 5.4,
        5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0,
    ]
    expected_float_counts = [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11]
    expected_int_counts = [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12]

    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [expected_float_counts, expected_bin_edges]
    assert histograms["int_column"] == [expected_int_counts, expected_bin_edges]

    str_counts = histograms["str_column"][0]
    str_labels = histograms["str_column"][1]
    assert str_counts == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable, so labels are compared as a set
    assert set(str_labels) == {"foo", None, "", "baz", "bar"}
def test_pandas_histograms_work_with_NaNs_and_nonseq_index(pandas_data_frame):
    """Check stats/histograms on a frame with NaNs and a non-sequential index.

    The fixture frame loses its "Names" column, gets a shuffled integer
    index, and is extended by one row that only defines a new "foo" column.
    Both "Births" and "foo" therefore contain NaNs.
    """
    # Arrange
    reindexed = pandas_data_frame.drop(columns="Names").set_index(
        [pd.Index([90, 30, 50, 70, 10])]  # emulate real world DF indices
    )
    extra_row = pd.DataFrame([{"foo": 42}])
    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    # sort=False matches append's default and keeps the column order.
    pandas_data_frame = pd.concat([reindexed, extra_row], sort=False)
    meta_conf = ValueMetaConf.enabled()

    # Act
    stats, histograms = PandasHistograms(
        pandas_data_frame, meta_conf
    ).get_histograms_and_stats()

    # Assert
    assert sorted(stats.keys()) == sorted(["Births", "foo"])  # noqa
    assert sorted(histograms.keys()) == sorted(["Births", "foo"])  # noqa
    assert stats == {
        "Births": {
            "25%": 155.0,
            "50%": 578.0,
            "75%": 968.0,
            "count": 6,
            "distinct": 6,
            "max": 973.0,
            "mean": 550.2,
            "min": 77.0,
            "non-null": 5,
            "null-count": 1,
            "std": 428.4246724921,
            "type": "float64",
        },
        "foo": {
            "25%": 42.0,
            "50%": 42.0,
            "75%": 42.0,
            "count": 6,
            "distinct": 2,
            "max": 42.0,
            "mean": 42.0,
            "min": 42.0,
            "non-null": 1,
            "null-count": 5,
            "type": "float64",
        },
    }
    # fmt: off
    assert histograms == {
        "Births": [
            [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2],
            [
                77.0, 121.8, 166.6, 211.39999999999998, 256.2, 301.0,
                345.79999999999995, 390.59999999999997, 435.4, 480.2, 525.0,
                569.8, 614.5999999999999, 659.4, 704.1999999999999, 749.0,
                793.8, 838.5999999999999, 883.4, 928.1999999999999, 973.0
            ],
        ],
        "foo": [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [
                41.5, 41.55, 41.6, 41.65, 41.7, 41.75, 41.8, 41.85, 41.9,
                41.95, 42.0, 42.05, 42.1, 42.15, 42.2, 42.25, 42.3, 42.35,
                42.4, 42.45, 42.5
            ]
        ]
    }
def test_pandas_v0_histograms():
    """Tests pandas histograms calculation is stable across Pandas v1 & v0.

    NOTE(review): the ``columns_stats`` comparison used to be a bare
    ``==`` expression whose result was discarded, so it was never enforced.
    It is now asserted; confirm the expected values below (for example the
    "multi_data_types" entry) against the real calculator output.
    """
    meta_conf = ValueMetaConf.enabled()
    columns_stats, histograms = PandasHistograms(
        diverse_df, meta_conf
    ).get_histograms_and_stats()

    # fmt: off
    expected_columns_stats = [
        ColumnStatsArgs(
            column_name="bool_column",
            column_type="bool",
            records_count=100,
            distinct_count=3,
            null_count=35,
            most_freq_value=False,
            most_freq_value_count=33,
            unique_count=2,
        ),
        ColumnStatsArgs(
            column_name="float_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=6,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.7127659574,
            min_value=0.0,
            std_value=2.8572576537,
        ),
        ColumnStatsArgs(
            column_name="int_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=8,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.8804347826,
            min_value=0.0,
            std_value=2.7449950111,
        ),
        ColumnStatsArgs(
            column_name="str_column",
            column_type="str",
            records_count=100,
            distinct_count=5,
            null_count=21,
            most_freq_value="foo",
            most_freq_value_count=22,
            unique_count=4,
        ),
        ColumnStatsArgs(
            column_name="multi_data_types",
            column_type="str",
            records_count=100,
            distinct_count=8,
            null_count=10,
            most_freq_value="foo",
            most_freq_value_count=11,
            unique_count=18,
        ),
    ]
    # Without `assert` this comparison would be evaluated and thrown away.
    assert columns_stats == expected_columns_stats

    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column", "multi_data_types"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95, 5.4,
            5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95, 5.4,
            5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}