def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    """Log scalar metrics and a dataframe, then read everything back from the file store."""
    store_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(store_folder)
    tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())
    tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )

    # Log a handful of scalar-ish metrics plus one dataframe.
    tracker.log_metric("a", 1)
    tracker.log_metric("a_string", "1")
    tracker.log_metric("a_list", [1, 3])
    tracker.log_metric("a_tuple", (1, 2))
    tracker.log_dataframe("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    actual = TaskRunMetricsFileStoreReader(store_folder).get_all_metrics_values()
    print(actual)

    # The schema payload is checked in other tests; here only its presence matters.
    assert "df.schema" in actual
    del actual["df.schema"]
    assert actual == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
        "df.preview": "Names Births",
        "df.shape": "(5, 2)",
        "df.shape_0_": 5.0,
        "df.shape_1_": 2.0,
    }
def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    """Log a dataframe via log_data and verify its std lands in the histograms metric source.

    Fix: the original built an ``expected_preview`` string that was never used
    anywhere in this test (dead local) — removed.
    """
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.tracking.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )

    tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)

    # std value varies in different py versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)
def test_get_value_meta(self, snowflake_table):
    """SnowflakeTableValueType.get_value_meta should assemble its meta from the controller."""
    # Arrange: patch the controller so no real Snowflake connection is made.
    with mock.patch(
        "dbnd_snowflake.snowflake_values.SnowflakeController",
        new_callable=snowflake_controller_mock,
    ) as snowflake:
        # Act
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table, meta_conf=ValueMetaConf.enabled()
        )

        # Assert: every field comes from the mocked controller's canned answers.
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        expected_schema = {
            "type": "SnowflakeTable",
            "column_types": {"name": "varchar"},
            "size": "500 B",
        }
        assert value_meta.data_schema == expected_schema
        expected_hash = (
            "snowflake://*****:*****@SNOWFLAKE_ACCOUNT"
            "/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        assert value_meta.data_hash == expected_hash
        assert snowflake.get_column_types.called
        assert snowflake.get_dimensions.called
        assert snowflake.to_preview.called
def test_df_value_meta(self, pandas_data_frame):
    """DataFrameValueType meta must report preview, shape, schema and a stable hash."""
    df = pandas_data_frame
    meta_conf = ValueMetaConf.enabled()

    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(df.columns),
        "size": int(df.size),
        "shape": df.shape,
        "dtypes": {col: str(type_) for col, type_ in df.dtypes.items()},
    }
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            df, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=df.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(hash_pandas_object(df, index=True).values),
    )

    df_value_meta = DataFrameValueType().get_value_meta(df, meta_conf=meta_conf)

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # Compare schemas through their JSON form so nested containers are normalized.
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
    """Scalar metrics logged by the tracker should round-trip through the user metric source."""
    store_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(store_folder)
    tracker = TaskRunTracker(task_run=task_run, tracking_store=FileTrackingStore())
    tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )

    # Same calls as the original, written as a data-driven loop.
    for name, value in [
        ("a", 1),
        ("a_string", "1"),
        ("a_list", [1, 3]),
        ("a_tuple", (1, 2)),
    ]:
        tracker.log_metric(name, value)

    user_metrics = TaskRunMetricsFileStoreReader(store_folder).get_all_metrics_values(
        MetricSource.user
    )
    assert user_metrics == {
        "a": 1.0,
        "a_list": "[1, 3]",
        "a_string": 1.0,
        "a_tuple": "(1, 2)",
    }
def test_spark_df_value_meta(self, spark_data_frame):
    """Spark DataFrame meta should match preview/shape/schema computed by hand (no hash)."""
    df = spark_data_frame
    row_count = df.count()
    col_count = len(df.columns)

    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(df.schema.names),
        "size": int(row_count * col_count),
        "shape": (row_count, col_count),
        "dtypes": {f.name: str(f.dataType) for f in df.schema.fields},
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            df, meta_conf.get_preview_size()
        ),
        data_dimensions=(row_count, col_count),
        data_schema=expected_data_schema,
        data_hash=None,
    )

    # NOTE(review): get_value_meta is called without a meta_conf here, while the
    # expected values above use ValueMetaConf.enabled() — presumably the defaults
    # agree for the asserted fields; confirm against SparkDataFrameValueType.
    df_value_meta = SparkDataFrameValueType().get_value_meta(df)

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta == expected_value_meta
def test_str_value_meta(self):
    """A plain string previews as itself: no dimensions, 'str' schema, hashed value."""
    value = "foo"
    actual = StrValueType().get_value_meta(value, ValueMetaConf.enabled())
    expected = ValueMeta(
        value_preview=value,
        data_dimensions=None,
        data_schema={"type": "str"},
        data_hash=fast_hasher.hash(value),
    )
    assert actual == expected
def test_spark_df_value_meta(
    self, spark_data_frame, spark_data_frame_histograms, spark_data_frame_stats
):
    """Full Spark meta: preview, dimensions, signature hash, schema, plus stats and
    histograms checked loosely where their values are unstable between runs."""
    expected_data_schema = {
        "type": SparkDataFrameValueType.type_str,
        "columns": list(spark_data_frame.schema.names),
        "size.bytes": int(spark_data_frame.count() * len(spark_data_frame.columns)),
        "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
        "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
    }
    # Only the key set of these timing metrics is stable enough to assert on.
    expected_hist_sys_metrics = {
        "boolean_histograms_and_stats_calc_time",
        "histograms_and_stats_calc_time",
        "numeric_histograms_and_stats_calc_time",
        "string_histograms_and_stats_calc_time",
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=SparkDataFrameValueType().to_preview(
            spark_data_frame, meta_conf.get_preview_size()
        ),
        data_dimensions=(spark_data_frame.count(), len(spark_data_frame.columns)),
        data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
        data_schema=expected_data_schema,
        descriptive_stats=spark_data_frame_stats,
        histograms=spark_data_frame_histograms,
    )

    df_value_meta = SparkDataFrameValueType().get_value_meta(
        spark_data_frame, meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    # it changes all the time, it has different formats, and it's already tested in histogram tests
    # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

    # histogram_system_metrics values are too dynamic, so checking only keys, but not values
    assert (
        set(df_value_meta.histogram_system_metrics.keys()) == expected_hist_sys_metrics
    )
    df_value_meta.histogram_system_metrics = None
    # assert df_value_meta.histograms == expected_value_meta.histograms
    # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

    # Also exercise the pandas path on the same data (result intentionally unused).
    pandas_data_frame = spark_data_frame.toPandas()
    pandas_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf
    )
def test_df_value_meta(
    self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
):
    """Full pandas meta: preview/hash/schema/shape plus descriptive stats and
    histograms, with unstable pieces (std, 'top', value order) compared loosely."""
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
        descriptive_stats=pandas_data_frame_stats,
        histograms=pandas_data_frame_histograms,
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    # Compare schemas through their JSON form so nested containers are normalized.
    assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
        expected_value_meta.data_schema
    )
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    # std fluctuates with float precision: compare rounded, and pop it from both
    # sides (along with the unstable 'top') before the wholesale stats comparison.
    std = df_value_meta.descriptive_stats["Births"].pop("std")
    expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
    assert round(std, 2) == expected_std
    df_value_meta.descriptive_stats["Names"].pop("top")
    assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats
    counts, values = df_value_meta.histograms.pop("Names")
    expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
    assert counts == expected_counts
    assert set(values) == set(expected_values)  # order changes in each run
    # histograms are tested in histogram tests and they change a lot, no need to test also here
    df_value_meta.histograms = expected_value_meta.histograms = None
    expected_value_meta.histogram_system_metrics = (
        df_value_meta.histogram_system_metrics
    )
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
def test_target_value_meta(self):
    """A path-like target previews as its quoted name with a 'Path' schema."""
    v = target("a")
    meta_conf = ValueMetaConf.enabled()
    expected = ValueMeta(
        value_preview='"a"',
        data_dimensions=None,
        data_schema={"type": "Path"},
        data_hash=fast_hasher.hash(v),
    )
    assert TargetPathLibValueType().get_value_meta(v, meta_conf=meta_conf) == expected
def test_get_histograms_and_stats(self):
    """PostgresController should turn pg_stats/pg_class rows into column stats and histograms."""
    with mock.patch(
        "dbnd_postgres.postgres_values.PostgresController._query"
    ) as query_patch:
        # Arrange: canned result sets for the three queries, in execution order.
        query_patch.side_effect = [
            [  # pg_stats
                {
                    "attname": "customer",
                    "null_frac": 0.5,
                    "n_distinct": 8,
                    "most_common_vals": "{customerA, customerB}",
                    "most_common_freqs": [0.2, 0.2],
                }
            ],
            [{"reltuples": 10}],  # pg_class
            [  # information_schema.columns
                {"column_name": "customer", "data_type": "varchar"}
            ],
        ]
        expected_columns_stats = [
            ColumnStatsArgs(
                column_name="customer",
                column_type="varchar",
                records_count=10,
                distinct_count=8,
                null_count=5,
            )
        ]
        expected_histograms = {
            "customer": ([2, 2, 1], ["customerA", "customerB", "_others"])
        }

        # Act
        postgres = PostgresController("user@database", "data_table")
        columns_stats, histograms = postgres.get_histograms_and_stats(
            ValueMetaConf.enabled()
        )

        # Assert
        assert columns_stats == expected_columns_stats
        assert histograms == expected_histograms
def test_get_value_meta(self, snowflake_table):
    """get_value_meta should delegate schema/dimensions/preview to the table's controller."""
    value_meta = SnowflakeTableValueType().get_value_meta(
        snowflake_table, meta_conf=ValueMetaConf.enabled()
    )

    # Assert
    assert value_meta.value_preview == "test preview"
    assert value_meta.data_dimensions == [42, 12]
    expected_schema = {
        "type": "SnowflakeTable",
        "column_types": {"name": "varchar"},
        "size.bytes": 500,
    }
    assert value_meta.data_schema == expected_schema
    assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE
    ctrl = snowflake_table.snowflake_ctrl
    assert ctrl.get_column_types.called
    assert ctrl.get_dimensions.called
    assert ctrl.to_preview.called
def test_df_value_meta(self):
    """Building meta for a PostgresTable must hit the controller's schema/stats/preview APIs."""
    # Arrange
    table = PostgresTable(
        table_name="test_table", connection_string="*****@*****.**"
    )
    with mock.patch(
        "dbnd_postgres.postgres_values.PostgresController",
        new_callable=postgres_controller_mock,
    ) as postgres:
        # Act
        PostgresTableValueType().get_value_meta(
            table, meta_conf=ValueMetaConf.enabled()
        )
        # Assert
        assert postgres.columns_types.called
        assert postgres.get_histograms_and_stats.called
        assert postgres.to_preview.called
def test_df_value_meta(self, pandas_data_frame):
    """Pandas meta: preview/hash/schema/shape plus stats & histogram key sets.

    Fix: the original asserted ``data_schema`` equality twice; the duplicate
    assertion was removed.
    """
    expected_data_schema = {
        "type": DataFrameValueType.type_str,
        "columns": list(pandas_data_frame.columns),
        "size.bytes": int(pandas_data_frame.size),
        "shape": pandas_data_frame.shape,
        "dtypes": {
            col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
        },
    }
    meta_conf = ValueMetaConf.enabled()
    expected_value_meta = ValueMeta(
        value_preview=DataFrameValueType().to_preview(
            pandas_data_frame, preview_size=meta_conf.get_preview_size()
        ),
        data_dimensions=pandas_data_frame.shape,
        data_schema=expected_data_schema,
        data_hash=fast_hasher.hash(
            hash_pandas_object(pandas_data_frame, index=True).values
        ),
    )

    df_value_meta = DataFrameValueType().get_value_meta(
        pandas_data_frame, meta_conf=meta_conf
    )

    assert df_value_meta.value_preview == expected_value_meta.value_preview
    assert df_value_meta.data_hash == expected_value_meta.data_hash
    assert df_value_meta.data_schema == expected_value_meta.data_schema
    assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
    # histograms and stats are tested in histogram tests and they change a lot,
    # no need to test also here — only the key sets are pinned.
    assert {
        col_stats.column_name for col_stats in df_value_meta.columns_stats
    } == {"Names", "Births"}
    assert set(df_value_meta.histograms.keys()) == {"Names", "Births"}
def test_pandas_v0_histograms():
    """Pin pandas histogram/stats calculation so results stay stable across Pandas v1 & v0.

    Fix: the original wrote ``columns_stats == [...]`` as a bare expression, so
    the comparison result was computed and silently discarded — the column stats
    were never actually verified. The missing ``assert`` is added.
    """
    meta_conf = ValueMetaConf.enabled()
    columns_stats, histograms = PandasHistograms(
        diverse_df, meta_conf
    ).get_histograms_and_stats()
    # fmt: off
    assert columns_stats == [
        ColumnStatsArgs(
            column_name="bool_column",
            column_type="bool",
            records_count=100,
            distinct_count=3,
            null_count=35,
            most_freq_value=False,
            most_freq_value_count=33,
            unique_count=2,
        ),
        ColumnStatsArgs(
            column_name="float_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=6,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.7127659574,
            min_value=0.0,
            std_value=2.8572576537,
        ),
        ColumnStatsArgs(
            column_name="int_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=8,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.8804347826,
            min_value=0.0,
            std_value=2.7449950111,
        ),
        ColumnStatsArgs(
            column_name="str_column",
            column_type="str",
            records_count=100,
            distinct_count=5,
            null_count=21,
            most_freq_value="foo",
            most_freq_value_count=22,
            unique_count=4,
        ),
        ColumnStatsArgs(
            column_name="multi_data_types",
            column_type="str",
            records_count=100,
            distinct_count=8,
            null_count=10,
            most_freq_value="foo",
            most_freq_value_count=11,
            unique_count=18,
        ),
    ]
    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column",
        "multi_data_types"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}
def meta_conf(self):
    """Return a ValueMetaConf with every tracking feature switched on."""
    enabled_conf = ValueMetaConf.enabled()
    return enabled_conf
def test_pandas_v0_histograms():
    """Pin the stats-dict flavor of pandas histogram calculation so it stays
    stable across Pandas v1 & v0 (exact literals; unstable pieces asserted loosely)."""
    meta_conf = ValueMetaConf.enabled()
    stats, histograms = PandasHistograms(
        diverse_df, meta_conf
    ).get_histograms_and_stats()
    # fmt: off
    assert stats == {
        "bool_column": {
            "count": 100,
            "distinct": 3,
            "freq": 33,
            "non-null": 65,
            "null-count": 35,
            "top": False,
            "type": "bool",
            "unique": 2,
        },
        "float_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.7127659574,
            "min": 0.0,
            "non-null": 94,
            "null-count": 6,
            "std": 2.8572576537,
            "type": "float64",
        },
        "int_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.8804347826,
            "min": 0.0,
            "non-null": 92,
            "null-count": 8,
            "std": 2.7449950111,
            "type": "float64",
        },
        "str_column": {
            "count": 100,
            "distinct": 5,
            "freq": 22,
            "non-null": 79,
            "null-count": 21,
            "top": "foo",
            "type": "str",
            "unique": 4,
        },
    }
    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}
def test_pandas_histograms_work_with_NaNs_and_nonseq_index(pandas_data_frame):
    """Histograms/stats must cope with NaNs and a non-sequential index.

    Appending {"foo": 42} introduces a NaN-filled 'foo' column and a NaN row
    for 'Births'.
    """
    # Arrange
    # NOTE(review): DataFrame.append was removed in pandas 2.x — this test
    # presumably runs against pandas < 2; confirm pinned pandas version.
    pandas_data_frame = (
        pandas_data_frame.drop(columns="Names")
        .set_index([pd.Index([90, 30, 50, 70, 10])])  # emulate real world DF indices
        .append([{"foo": 42}])
    )
    meta_conf = ValueMetaConf.enabled()
    # Act
    stats, histograms = PandasHistograms(
        pandas_data_frame, meta_conf
    ).get_histograms_and_stats()
    # Assert
    assert sorted(stats.keys()) == sorted(["Births", "foo"])  # noqa
    assert sorted(histograms.keys()) == sorted(["Births", "foo"])  # noqa
    assert stats == {
        "Births": {
            "25%": 155.0,
            "50%": 578.0,
            "75%": 968.0,
            "count": 6,
            "distinct": 6,
            "max": 973.0,
            "mean": 550.2,
            "min": 77.0,
            "non-null": 5,
            "null-count": 1,
            "std": 428.4246724921,
            "type": "float64",
        },
        "foo": {
            "25%": 42.0,
            "50%": 42.0,
            "75%": 42.0,
            "count": 6,
            "distinct": 2,
            "max": 42.0,
            "mean": 42.0,
            "min": 42.0,
            "non-null": 1,
            "null-count": 5,
            "type": "float64",
        },
    }
    # fmt: off
    assert histograms == {
        "Births": [
            [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2],
            [
                77.0, 121.8, 166.6, 211.39999999999998, 256.2, 301.0,
                345.79999999999995, 390.59999999999997, 435.4, 480.2, 525.0,
                569.8, 614.5999999999999, 659.4, 704.1999999999999, 749.0,
                793.8, 838.5999999999999, 883.4, 928.1999999999999, 973.0
            ],
        ],
        "foo": [
            [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
            [
                41.5, 41.55, 41.6, 41.65, 41.7, 41.75, 41.8, 41.85, 41.9,
                41.95, 42.0, 42.05, 42.1, 42.15, 42.2, 42.25, 42.3, 42.35,
                42.4, 42.45, 42.5
            ]
        ]
    }
def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
    """End-to-end: log_data() persists stats/histograms/preview/schema for a
    dataframe, and the file-store reader returns them under the histograms source.
    Unstable entries are popped before the wholesale comparison."""
    metrics_folder = target(str(tmpdir))
    task_run = Mock()
    task_run.meta_files = TaskRunMetaFiles(metrics_folder)
    t = FileTrackingStore()
    tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
    tr_tracker.settings.features.get_value_meta_conf = Mock(
        return_value=ValueMetaConf.enabled()
    )
    tr_tracker.log_data(
        "df",
        pandas_data_frame,
        meta_conf=ValueMetaConf.enabled(),
    )
    hist_metrics = TaskRunMetricsFileStoreReader(
        metrics_folder
    ).get_all_metrics_values(MetricSource.histograms)
    expected_preview = (
        " Names Births Married\n"
        " Bob 968 True\n"
        " Jessica 155 False\n"
        " Mary 77 True\n"
        " John 578 False\n"
        " Mel 973 True"
    )
    # std value varies in different py versions due to float precision fluctuation
    df_births_std = hist_metrics["df.Births.std"]
    assert df_births_std == pytest.approx(428.4246)
    # Remove the entries whose exact values change between runs ('top' picks
    # and the numeric histogram buckets) before comparing everything else.
    hist_metrics["df.histograms"].pop("Names")
    hist_metrics["df.histograms"].pop("Births")
    hist_metrics.pop("df.Married.top")
    hist_metrics.pop("df.Names.top")
    hist_metrics["df.stats"]["Names"].pop("top")
    hist_metrics["df.stats"]["Married"].pop("top")
    assert hist_metrics == {
        "df.Births.type": "int64",
        "df.Births.25%": 155.0,
        "df.Births.50%": 578.0,
        "df.Births.75%": 968.0,
        "df.Births.count": 5.0,
        "df.Births.distinct": 5,
        "df.Births.std": df_births_std,
        "df.Births.max": 973.0,
        "df.Births.mean": 550.2,
        "df.Births.min": 77.0,
        "df.Births.non-null": 5,
        "df.Births.null-count": 0,
        "df.Married.count": 5,
        "df.Married.distinct": 2,
        "df.Married.freq": 3,
        "df.Married.non-null": 5,
        "df.Married.null-count": 0,
        "df.Married.type": "bool",
        "df.Married.unique": 2,
        "df.Names.count": 5,
        "df.Names.distinct": 5,
        "df.Names.freq": 1,
        "df.Names.non-null": 5,
        "df.Names.null-count": 0,
        "df.Names.type": "object",
        "df.Names.unique": 5,
        "df.histograms": {"Married": [[3, 2], [True, False]],},
        "df.preview": expected_preview,
        "df.schema": {
            "columns": ["Names", "Births", "Married"],
            "dtypes": {"Births": "int64", "Names": "object", "Married": "bool"},
            "shape": [5, 3],
            "size": 15,
            "type": "DataFrame",
        },
        "df.shape": [5, 3],
        "df.shape0": 5,
        "df.shape1": 3,
        "df.stats": {
            "Births": {
                "type": "int64",
                "25%": 155.0,
                "50%": 578.0,
                "75%": 968.0,
                "count": 5.0,
                "distinct": 5,
                "max": 973.0,
                "mean": 550.2,
                "min": 77.0,
                "non-null": 5,
                "null-count": 0,
                "std": df_births_std,
            },
            "Married": {
                "count": 5,
                "distinct": 2,
                "freq": 3,
                "non-null": 5,
                "null-count": 0,
                "type": "bool",
                "unique": 2,
            },
            "Names": {
                "count": 5,
                "distinct": 5,
                "freq": 1,
                "non-null": 5,
                "null-count": 0,
                "type": "object",
                "unique": 5,
            },
        },
    }
def meta_conf(self):
    """Fully-enabled ValueMetaConf used by these tests."""
    return ValueMetaConf.enabled()
class LazyValueType(DataValueType):
    """A DataValueType that is evaluated lazily; fast-count is supported only
    for parquet file targets."""

    # Marks this type as lazily evaluated for the tracking machinery.
    is_lazy_evaluated = True

    def support_fast_count(self, target):
        """Return True only for FileTargets stored in parquet format."""
        # Imports are local to avoid import-time dependencies at module load.
        from targets import FileTarget

        if not isinstance(target, FileTarget):
            return False
        from targets.target_config import FileFormat

        return target.config.format == FileFormat.parquet


# Reference configurations exercised by the merge tests below.
ALL_NONE = ValueMetaConf()  # nothing set explicitly
ALL_TRUE = ValueMetaConf.enabled()  # every flag on
ALL_FALSE = ValueMetaConf(
    log_size=False,
    log_preview=False,
    log_schema=False,
    log_stats=False,
    log_histograms=False,
)


class TestValueMetaConf(object):
    @pytest.mark.parametrize(
        "left, right, expected",
        [
            (ALL_NONE, ALL_TRUE, ALL_TRUE),
            (ALL_TRUE, ALL_NONE, ALL_TRUE),