def test_df_value_meta(
        self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
    ):
        """Full-featured get_value_meta check for a pandas DataFrame.

        Compares every ValueMeta field against a hand-built expectation, with
        special handling for fields that are not bit-stable between runs
        (float std, tied "top" value, histogram value ordering, timing metrics).
        """
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()
            ),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values
            ),
            descriptive_stats=pandas_data_frame_stats,
            histograms=pandas_data_frame_histograms,
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf
        )

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema
        )
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

        # "std" is removed from both sides and compared with rounding —
        # presumably the fixture stores a 2-decimal value; TODO confirm.
        std = df_value_meta.descriptive_stats["Births"].pop("std")
        expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
        assert round(std, 2) == expected_std
        # "top" is dropped only from the computed side — presumably the expected
        # stats fixture carries no "top" for Names (ties make it unstable).
        df_value_meta.descriptive_stats["Names"].pop("top")
        assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        counts, values = df_value_meta.histograms.pop("Names")
        expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
        assert counts == expected_counts
        assert set(values) == set(expected_values)  # order changes in each run
        # histograms are tested in histogram tests and they change a lot, no need to test also here
        df_value_meta.histograms = expected_value_meta.histograms = None

        # timing-dependent system metrics: copy over before the full compare
        expected_value_meta.histogram_system_metrics = (
            df_value_meta.histogram_system_metrics
        )
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
Exemplo n.º 2
0
    def test_spark_df_value_meta(self, spark_data_frame):
        """SparkDataFrameValueType.get_value_meta matches a hand-built ValueMeta."""
        rows = spark_data_frame.count()
        cols = len(spark_data_frame.columns)

        expected_data_schema = {
            "type": SparkDataFrameValueType.type_str,
            "columns": list(spark_data_frame.schema.names),
            "size": int(rows * cols),
            "shape": (rows, cols),
            "dtypes": {
                field.name: str(field.dataType)
                for field in spark_data_frame.schema.fields
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()
            ),
            data_dimensions=(rows, cols),
            data_schema=expected_data_schema,
            data_hash=None,
        )

        df_value_meta = SparkDataFrameValueType().get_value_meta(spark_data_frame)

        # Field-by-field first (clearer failure messages), then the whole object.
        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Exemplo n.º 3
0
    def get_value_meta(self, value: SqlOperation, meta_conf):
        """Build a ValueMeta for a SQL operation (schema/size only, no stats)."""
        data_schema = {}
        data_dimensions = None

        if meta_conf.log_schema:
            data_schema["type"] = self.type_str
            data_schema["dtypes"] = value.dtypes

        if meta_conf.log_size:
            data_dimensions = [value.records_count, value.columns_count]
            data_schema["shape"] = data_dimensions
            # todo: size?

        # columns_stats and histograms are not implemented for SQL operations yet
        return ValueMeta(
            value_preview=None,
            data_dimensions=data_dimensions,
            data_schema=data_schema,
            data_hash=str(hash(self.to_signature(value))),
            columns_stats=[],
            histogram_system_metrics=None,
            histograms={},
        )
Exemplo n.º 4
0
    def get_value_meta(self, value, meta_conf):
        # type: (list, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta for a plain python list."""
        data_schema = self.get_list_metrics(value, meta_conf)
        data_dimensions = data_schema.get("shape")
        if meta_conf.log_size:
            data_schema["size.bytes"] = value.__sizeof__()

        value_preview = None
        data_hash = None
        if meta_conf.log_preview:
            value_preview = self.to_preview(
                value, preview_size=self.get_preview_size(meta_conf)
            )
            try:
                data_hash = hash(json.dumps(value))
            except Exception as e:
                # a non-JSON-serializable list is logged, not fatal
                logger.warning("Could not hash list %s! Exception: %s", value, e)

        # calculating stats, metrics and histograms are out of scope at the moment
        return ValueMeta(
            value_preview=value_preview,
            data_dimensions=data_dimensions,
            data_schema=data_schema if meta_conf.log_schema else None,
            data_hash=data_hash,
            columns_stats=[],
            histogram_system_metrics=None,
            histograms={},
        )
Exemplo n.º 5
0
    def test_df_value_meta(self, pandas_data_frame):
        """DataFrameValueType.get_value_meta equals a hand-built ValueMeta."""
        frame = pandas_data_frame
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(frame.columns),
            "size": int(frame.size),
            "shape": frame.shape,
            "dtypes": {name: str(dtype) for name, dtype in frame.dtypes.items()},
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                frame, preview_size=meta_conf.get_preview_size()
            ),
            data_dimensions=frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(hash_pandas_object(frame, index=True).values),
        )

        df_value_meta = DataFrameValueType().get_value_meta(frame, meta_conf=meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        # compare schemas via JSON so tuples/lists normalize the same way
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema
        )
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Exemplo n.º 6
0
    def get_value_meta(self, value, meta_conf):
        # type: (PostgresTable, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta for a Postgres table via PostgresController.

        Holds one controller connection for the whole extraction and, based on
        meta_conf flags, collects stats/histograms (timed), a preview, and a
        column-type schema.
        """
        data_schema = data_preview = None

        with PostgresController(value.connection_string,
                                value.table_name) as postgres:
            if meta_conf.log_histograms or meta_conf.log_stats:
                # time the DB-side computation and report it as a system metric
                start_time = time.time()
                stats, histograms = postgres.get_histograms_and_stats(
                    meta_conf)
                hist_sys_metrics = {
                    "histograms_and_stats_calc_time": time.time() - start_time
                }
            else:
                stats, histograms = {}, {}
                hist_sys_metrics = None
            if meta_conf.log_preview:
                data_preview = postgres.to_preview()
            if meta_conf.log_schema:
                data_schema = {
                    "type": self.type_str,
                    "column_types": postgres.get_column_types(),
                }

        return ValueMeta(
            value_preview=data_preview,
            data_dimensions=None,
            data_schema=data_schema,
            data_hash=self.to_signature(value),
            descriptive_stats=stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histograms,
        )
Exemplo n.º 7
0
    def get_value_meta(self, value, meta_conf):
        # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta (schema, size, preview, hash) for a pandas frame."""
        data_schema = {}
        if meta_conf.log_schema:
            data_schema["type"] = self.type_str
            data_schema["columns"] = list(value.columns)
            data_schema["shape"] = value.shape
            data_schema["dtypes"] = {
                name: str(dtype) for name, dtype in value.dtypes.items()
            }

        if meta_conf.log_size:
            data_schema["size"] = int(value.size)

        value_preview = None
        data_hash = None
        if meta_conf.log_preview:
            value_preview = self.to_preview(
                value, preview_size=meta_conf.get_preview_size()
            )
            # hash row-wise including the index so both data and order count
            data_hash = fast_hasher.hash(hash_pandas_object(value, index=True).values)

        # NOTE(review): data_dimensions is reported even when log_size is off
        return ValueMeta(
            value_preview=value_preview,
            data_dimensions=value.shape,
            data_schema=data_schema,
            data_hash=data_hash,
        )
Exemplo n.º 8
0
    def get_value_meta(self, value, meta_conf):
        # type: (SnowflakeTable, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta for a Snowflake table (stats/histograms unsupported)."""
        data_schema = {}
        data_preview = data_dimensions = None

        with self.get_snowflake(value) as snowflake:
            # stats and histograms are never computed for Snowflake tables here
            stats, histograms = {}, {}
            hist_sys_metrics = None
            if meta_conf.log_preview:
                data_preview = snowflake.to_preview(value)
            if meta_conf.log_schema:
                data_schema = {
                    "type": self.type_str,
                    "column_types": snowflake.get_column_types(value),
                }
            if meta_conf.log_size:
                dimensions = snowflake.get_dimensions(value)
                data_dimensions = [dimensions["rows"], dimensions["cols"]]
                # presumably a human-readable string (e.g. "1.5 MB") — unlike the
                # raw "size.bytes" int used by other value types; TODO confirm
                data_schema["size"] = humanize_bytes(dimensions["bytes"])

        return ValueMeta(
            value_preview=data_preview,
            data_dimensions=data_dimensions,
            data_schema=data_schema,
            data_hash=self.to_signature(value),
            descriptive_stats=stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histograms,
        )
Exemplo n.º 9
0
    def get_value_meta(self, value, meta_conf):
        # type: (SnowflakeTable, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta for a Snowflake table through its controller."""
        data_schema = {}
        data_preview = None
        data_dimensions = None

        if meta_conf.log_preview:
            data_preview = value.snowflake_ctrl.to_preview(value)

        if meta_conf.log_schema:
            data_schema = {
                "type": self.type_str,
                "dtypes": value.snowflake_ctrl.get_column_types(value),
            }

        if meta_conf.log_size:
            dims = value.snowflake_ctrl.get_dimensions(value)
            data_dimensions = [dims["rows"], dims["cols"]]
            data_schema["size.bytes"] = dims["bytes"]

        # columns_stats and histograms are not implemented for Snowflake yet
        return ValueMeta(
            value_preview=data_preview,
            data_dimensions=data_dimensions,
            data_schema=data_schema,
            data_hash=str(hash(self.to_signature(value))),
            columns_stats=[],
            histogram_system_metrics=None,
            histograms={},
        )
Exemplo n.º 10
0
 def test_str_value_meta(self):
     """StrValueType meta for a short string: preview, type schema, and hash."""
     str_value_meta = StrValueType().get_value_meta("foo", ValueMetaConf.enabled())
     expected_value_meta = ValueMeta(
         value_preview="foo",
         data_dimensions=None,
         data_schema={"type": "str"},
         data_hash=fast_hasher.hash("foo"),
     )
     assert str_value_meta == expected_value_meta
    def test_spark_df_value_meta(self, spark_data_frame,
                                 spark_data_frame_histograms,
                                 spark_data_frame_stats):
        """get_value_meta for a Spark frame with stats/histograms enabled.

        Stats and histograms themselves are covered by the histogram tests;
        here only the stable fields and the system-metric keys are asserted.
        """
        expected_data_schema = {
            "type":
            SparkDataFrameValueType.type_str,
            "columns":
            list(spark_data_frame.schema.names),
            "size.bytes":
            int(spark_data_frame.count() * len(spark_data_frame.columns)),
            "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
            "dtypes":
            {f.name: str(f.dataType)
             for f in spark_data_frame.schema.fields},
        }

        # only the metric *keys* are stable; the timing values are not
        expected_hist_sys_metrics = {
            "boolean_histograms_and_stats_calc_time",
            "histograms_and_stats_calc_time",
            "numeric_histograms_and_stats_calc_time",
            "string_histograms_and_stats_calc_time",
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()),
            data_dimensions=(spark_data_frame.count(),
                             len(spark_data_frame.columns)),
            data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
            data_schema=expected_data_schema,
            descriptive_stats=spark_data_frame_stats,
            histograms=spark_data_frame_histograms,
        )

        df_value_meta = SparkDataFrameValueType().get_value_meta(
            spark_data_frame, meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        # it changes all the time, it has different formats, and it's already tested in histogram tests
        # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        # histogram_system_metrics values are too dynamic, so checking only keys, but not values
        assert (set(df_value_meta.histogram_system_metrics.keys()) ==
                expected_hist_sys_metrics)
        df_value_meta.histogram_system_metrics = None

        # assert df_value_meta.histograms == expected_value_meta.histograms
        # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

        # NOTE(review): the pandas conversion below is never asserted against —
        # the snippet looks truncated; confirm against the original test file.
        pandas_data_frame = spark_data_frame.toPandas()
        pandas_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf)
Exemplo n.º 12
0
    def test_target_value_meta(self):
        """TargetPathLibValueType meta for a simple target path."""
        v = target("a")
        meta_conf = ValueMetaConf.enabled()
        target_value_meta = TargetPathLibValueType().get_value_meta(
            v, meta_conf=meta_conf
        )

        # preview is the JSON-quoted path string
        expected_value_meta = ValueMeta(
            value_preview='"a"',
            data_dimensions=None,
            data_schema={"type": "Path"},
            data_hash=fast_hasher.hash(v),
        )

        assert target_value_meta == expected_value_meta
Exemplo n.º 13
0
    def _calc_meta_data(self, data, meta_conf):
        # type: (Any, ValueMetaConf) -> ValueMeta
        """Best-effort ValueMeta for *data*; falls back to an empty preview."""
        data_meta = None
        if data is not None and meta_conf is not None:
            try:
                # Combine meta_conf with the config settings
                data_meta = get_value_meta(
                    data, meta_conf, tracking_config=self.settings.tracking
                )
            except Exception as e:
                # metadata extraction is best-effort: report, never fail the run
                log_exception_to_server(e)

        return data_meta if data_meta is not None else ValueMeta("")
Exemplo n.º 14
0
    def get_value_meta(self, value, meta_conf):
        # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
        """Build a full ValueMeta for a pandas DataFrame.

        Collects, according to meta_conf flags: column schema/dtypes, byte
        size, a preview plus a best-effort content hash, and timed
        stats/histograms via PandasHistograms.
        """
        data_schema = {}
        if meta_conf.log_schema:
            data_schema.update({
                "type": self.type_str,
                "columns": list(value.columns),
                "shape": value.shape,
                "dtypes":
                {col: str(type_)
                 for col, type_ in value.dtypes.items()},
            })

        if meta_conf.log_size:
            data_schema["size.bytes"] = int(value.size)

        value_preview, data_hash = None, None
        if meta_conf.log_preview:
            value_preview = self.to_preview(
                value, preview_size=meta_conf.get_preview_size())
            try:
                # hash row-wise including the index; failures are non-fatal
                data_hash = fast_hasher.hash(
                    hash_pandas_object(value, index=True).values)
            except Exception as e:
                logger.warning(
                    "Could not hash dataframe object %s! Exception: %s", value,
                    e)

        if meta_conf.log_histograms:
            # time the histogram computation and expose it as a system metric
            start_time = time.time()
            stats, histograms = PandasHistograms(
                value, meta_conf).get_histograms_and_stats()
            hist_sys_metrics = {
                "histograms_and_stats_calc_time": time.time() - start_time
            }
        else:
            stats, histograms = {}, {}
            hist_sys_metrics = None

        return ValueMeta(
            value_preview=value_preview,
            data_dimensions=value.shape,
            data_schema=data_schema,
            data_hash=data_hash,
            descriptive_stats=stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histograms,
        )
Exemplo n.º 15
0
    def get_value_meta(self, value, meta_conf):
        # type: (spark.DataFrame, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta (schema, preview, size, histograms) for a Spark frame."""
        data_schema = None
        if meta_conf.log_schema:
            data_schema = {
                "type": self.type_str,
                "columns": list(value.schema.names),
                "dtypes": {
                    field.name: str(field.dataType) for field in value.schema.fields
                },
            }

        data_preview = None
        if meta_conf.log_preview:
            data_preview = self.to_preview(value, meta_conf.get_preview_size())

        data_dimensions = None
        if meta_conf.log_size:
            # count() triggers a Spark job, so only run it when size is requested
            data_schema = data_schema or {}
            rows = value.count()
            data_dimensions = (rows, len(value.columns))
            data_schema["size"] = int(rows * len(value.columns))
            data_schema["shape"] = (rows, len(value.columns))

        df_stats, histogram_dict = {}, {}
        hist_sys_metrics = None
        if meta_conf.log_histograms or meta_conf.log_stats:
            spark_histograms = SparkHistograms(value, meta_conf)
            df_stats, histogram_dict = spark_histograms.get_histograms_and_stats()
            hist_sys_metrics = spark_histograms.system_metrics

        return ValueMeta(
            value_preview=data_preview,
            data_dimensions=data_dimensions,
            data_schema=data_schema,
            data_hash=self.to_signature(value),
            descriptive_stats=df_stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histogram_dict,
        )
Exemplo n.º 16
0
    def validate_numeric_histogram_and_stats(self, value_meta: ValueMeta,
                                             column_name: str) -> None:
        """assuming numbers fixture is used"""
        assert column_name in value_meta.histograms
        histogram = value_meta.histograms[column_name]
        # a histogram is a (counts, bucket_edges) pair: 20 buckets -> 21 edges
        assert len(histogram) == 2
        counts, edges = histogram
        assert len(counts) == 20
        assert len(edges) == 21
        assert sum(counts) == 8  # 8 non-null values in the fixture

        col_stats = value_meta.get_column_stats_by_col_name(column_name)
        assert col_stats.records_count == 10
        assert col_stats.non_null_count == 8
        assert col_stats.distinct_count == 4
        assert col_stats.min_value == 1
        assert col_stats.max_value == 5
Exemplo n.º 17
0
    def get_value_meta(self, value, meta_conf):
        # type: (spark.DataFrame, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta for a Spark DataFrame.

        Depending on meta_conf flags this collects: column schema/dtypes,
        a preview, a "stats" summary (df.summary()), size/shape (triggers a
        count() job), and histograms via self.get_histograms.
        """
        if meta_conf.log_schema:
            data_schema = {
                "type": self.type_str,
                "columns": list(value.schema.names),
                "dtypes":
                {f.name: str(f.dataType)
                 for f in value.schema.fields},
            }
        else:
            data_schema = None

        if meta_conf.log_preview:
            data_preview = self.to_preview(value, meta_conf.get_preview_size())
        else:
            data_preview = None

        if meta_conf.log_stats:
            # FIX: when log_schema is off, data_schema is None and assigning
            # data_schema["stats"] raised TypeError. Ensure a dict first —
            # the same guard the log_size branch below already uses.
            data_schema = data_schema or {}
            data_schema["stats"] = self.to_preview(
                value.summary(), meta_conf.get_preview_size())

        if meta_conf.log_size:
            data_schema = data_schema or {}
            rows = value.count()
            data_dimensions = (rows, len(value.columns))
            data_schema.update({
                "size": int(rows * len(value.columns)),
                "shape": (rows, len(value.columns)),
            })
        else:
            data_dimensions = None

        df_stats, histograms = None, None
        if meta_conf.log_df_hist:
            df_stats, histograms = self.get_histograms(value)

        return ValueMeta(
            value_preview=data_preview,
            data_dimensions=data_dimensions,
            data_schema=data_schema,
            data_hash=self.to_signature(value),
            descriptive_stats=df_stats,
            histograms=histograms,
        )
Exemplo n.º 18
0
    def get_value_meta(self, value, meta_conf):
        # type: (Any,  ValueMetaConf) -> ValueMeta
        """Generic ValueMeta: preview and hash when enabled; type-only schema."""
        preview = None
        data_hash = None
        if meta_conf.log_preview:
            preview = self.to_preview(value, preview_size=meta_conf.get_preview_size())
            data_hash = _safe_hash(value)

        return ValueMeta(
            value_preview=preview,
            data_dimensions=None,
            data_schema={"type": self.type_str},
            data_hash=data_hash,
        )
Exemplo n.º 19
0
    def get_value_meta(self, value: RedshiftOperation, meta_conf: ValueMetaConf):
        """Build a ValueMeta for a Redshift operation (histograms unsupported)."""
        dimensions = value.schema["shape"] if meta_conf.log_size else None
        data_schema = value.schema if meta_conf.log_schema else None

        column_stats = {}
        if meta_conf.log_stats:
            if value.dataframe is None:
                # no materialized frame: reuse stats collected by the operation
                column_stats = value.column_stats
            else:
                column_stats, _ = PandasHistograms(
                    value.dataframe, meta_conf
                ).get_histograms_and_stats()

        preview = value.preview if meta_conf.log_preview else ""

        return ValueMeta(
            value_preview=preview,
            data_dimensions=dimensions,
            data_schema=data_schema,
            data_hash=str(hash(self.to_signature(value))),
            columns_stats=column_stats,
            histogram_system_metrics=None,
            histograms={},
            query=value.query,
        )
Exemplo n.º 20
0
    def test_df_value_meta(self, pandas_data_frame):
        """get_value_meta with a size.bytes schema plus per-column stats/histograms.

        Stats/histograms themselves are covered by histogram tests; here only
        the covered column names are checked.
        """
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size.bytes": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_)
                for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values),
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        # fixed: data_schema was previously asserted twice in a row
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

        # histograms and stats are tested in histogram tests and they change a lot,
        # so only verify which columns were covered here
        assert {
            col_stats.column_name for col_stats in df_value_meta.columns_stats
        } == {"Names", "Births"}
        assert set(df_value_meta.histograms.keys()) == {"Names", "Births"}