示例#1
0
    def test_histogram_others(self, meta_conf):
        """Check that low-frequency string values are folded into "_others".

        The column holds 100 distinct strings where "str-i" occurs i times
        (5050 values in total). The assertions expect a 50-entry histogram
        ordered by descending frequency, whose last entry is the "_others"
        bucket carrying the combined count of "str-1" .. "str-51".
        """
        values = []
        for i in range(1, 101):
            # "str-i" is repeated i times, so every value has a unique frequency.
            values.extend(["str-{}".format(i)] * i)

        df = pd.DataFrame({"string_column": values})
        stats, histograms = PandasHistograms(
            df, meta_conf).get_histograms_and_stats()

        # histogram[0] holds the counts, histogram[1] the matching labels.
        histogram = histograms["string_column"]
        assert len(histogram[0]) == 50 and len(histogram[1]) == 50
        assert histogram[0][0] == 100 and histogram[1][0] == "str-100"
        assert histogram[0][10] == 90 and histogram[1][10] == "str-90"
        assert histogram[0][-2] == 52 and histogram[1][-2] == "str-52"
        assert histogram[0][-1] == sum(range(
            1, 52)) and histogram[1][-1] == "_others"

        stats = stats["string_column"]
        assert stats["count"] == 5050 == sum(histogram[0])
        assert stats["non-null"] == 5050
        assert stats["null-count"] == 0
        assert stats["distinct"] == 100
示例#2
0
    def get_value_meta(self, value: SqlOperation, meta_conf):
        """Build a ValueMeta describing a SQL operation.

        Each section is only populated when the matching ``meta_conf`` flag
        is set:

        * ``log_schema`` -- schema gets the type string and the dtypes.
        * ``log_size``   -- dimensions are ``[records_count, columns_count]``
          and are also stored in the schema under ``"shape"``.
        * ``log_stats``  -- column stats are computed with PandasHistograms,
          but only when the operation carries a dataframe.

        Histograms and histogram system metrics are never filled in here.
        """
        schema = {}
        if meta_conf.log_schema:
            schema = {"type": self.type_str, "dtypes": value.dtypes}

        dimensions = None
        if meta_conf.log_size:
            dimensions = [value.records_count, value.columns_count]
            schema["shape"] = dimensions
            # todo: size?

        # Histograms are not supported yet, so only the stats half of the
        # PandasHistograms result is kept.
        stats = []
        if meta_conf.log_stats and value.dataframe is not None:
            calculator = PandasHistograms(value.dataframe, meta_conf)
            stats, _ = calculator.get_histograms_and_stats()

        return ValueMeta(
            value_preview=None,
            data_dimensions=dimensions,
            query=value.query,
            data_schema=schema,
            data_hash=str(hash(self.to_signature(value))),
            columns_stats=stats,
            histogram_system_metrics=None,
            histograms={},
        )
示例#3
0
    def test_numeric_histogram(self, meta_conf):
        """Check the descriptive stats computed for a small integer column."""
        frame = pd.DataFrame({"numeric_column": [1, 3, 3, 1, 5, 1, 5, 5]})
        calculator = PandasHistograms(frame, meta_conf)
        all_stats, _histograms = calculator.get_histograms_and_stats()

        column_stats = all_stats["numeric_column"]
        # Eight values, none missing, three distinct (1, 3, 5).
        expectations = (
            ("count", 8),
            ("non-null", 8),
            ("distinct", 3),
            ("min", 1),
            ("max", 5),
        )
        for key, wanted in expectations:
            assert column_stats[key] == wanted
示例#4
0
    def get_value_meta(self, value, meta_conf):
        # type: (pd.DataFrame, ValueMetaConf) -> ValueMeta
        """Build a ValueMeta describing a pandas DataFrame.

        Which parts are collected is driven by ``meta_conf``:

        * ``log_schema``     -- type string, column names, shape and dtypes.
        * ``log_size``       -- ``"size.bytes"`` schema entry, taken from
          ``value.size``.
        * ``log_preview``    -- a preview of the frame plus a hash of its
          content; a hashing failure is logged and leaves the hash as None.
        * ``log_histograms`` -- stats and histograms from PandasHistograms,
          together with the wall-clock time the calculation took.
        """
        data_schema = {}
        if meta_conf.log_schema:
            data_schema["type"] = self.type_str
            data_schema["columns"] = list(value.columns)
            data_schema["shape"] = value.shape
            data_schema["dtypes"] = {
                name: str(dtype) for name, dtype in value.dtypes.items()
            }

        if meta_conf.log_size:
            data_schema["size.bytes"] = int(value.size)

        value_preview = None
        data_hash = None
        if meta_conf.log_preview:
            value_preview = self.to_preview(
                value, preview_size=meta_conf.get_preview_size())
            try:
                data_hash = fast_hasher.hash(
                    hash_pandas_object(value, index=True).values)
            except Exception as e:
                # Hashing is best effort: report the failure and carry on
                # without a hash.
                logger.warning(
                    "Could not hash dataframe object %s! Exception: %s", value,
                    e)

        if not meta_conf.log_histograms:
            stats, histograms = {}, {}
            hist_sys_metrics = None
        else:
            started_at = time.time()
            calculator = PandasHistograms(value, meta_conf)
            stats, histograms = calculator.get_histograms_and_stats()
            elapsed = time.time() - started_at
            hist_sys_metrics = {"histograms_and_stats_calc_time": elapsed}

        return ValueMeta(
            value_preview=value_preview,
            data_dimensions=value.shape,
            data_schema=data_schema,
            data_hash=data_hash,
            descriptive_stats=stats,
            histogram_system_metrics=hist_sys_metrics,
            histograms=histograms,
        )
示例#5
0
    def test_strings_histogram(self, meta_conf):
        """Check histogram and stats for a string column containing nulls.

        The column mixes three strings with ten None entries; "Ola Mundo!"
        appears in two separate runs (15 + 15 occurrences).
        """
        column_values = (
            ["Hello World!"] * 10
            + [None] * 10
            + ["Ola Mundo!"] * 15
            + ["Shalom Olam!"] * 20
            + ["Ola Mundo!"] * 15
        )
        frame = pd.DataFrame({"string_column": column_values})
        calculator = PandasHistograms(frame, meta_conf)
        all_stats, all_histograms = calculator.get_histograms_and_stats()

        # Index 0 holds the counts, index 1 the matching labels.
        string_histogram = all_histograms["string_column"]
        assert string_histogram[0] == [30, 20, 10]
        assert string_histogram[1] == [
            "Ola Mundo!", "Shalom Olam!", "Hello World!"
        ]

        string_stats = all_stats["string_column"]
        expectations = (
            ("count", 60),
            ("non-null", 60),
            ("null-count", 10),
            ("distinct", 4),
        )
        for key, wanted in expectations:
            assert string_stats[key] == wanted
示例#6
0
    def test_boolean_histogram(self, meta_conf):
        """Check histogram and stats for a boolean column containing nulls.

        The column holds 30 True values (in two runs), 20 False values and
        ten None entries.
        """
        column_values = [True] * 10 + [None] * 10 + [False] * 20 + [True] * 20
        frame = pd.DataFrame({"boolean_column": column_values})
        calculator = PandasHistograms(frame, meta_conf)
        all_stats, all_histograms = calculator.get_histograms_and_stats()

        # Index 0 holds the counts, index 1 the matching labels.
        bool_histogram = all_histograms["boolean_column"]
        assert bool_histogram[0] == [30, 20]
        assert bool_histogram[1] == [True, False]

        bool_stats = all_stats["boolean_column"]
        expectations = (
            ("count", 50),
            ("non-null", 50),
            ("null-count", 10),
            ("distinct", 3),
        )
        for key, wanted in expectations:
            assert bool_stats[key] == wanted
示例#7
0
    def get_value_meta(self, value: RedshiftOperation, meta_conf: ValueMetaConf):
        """Build a ValueMeta describing a Redshift operation.

        Each section is only populated when the matching ``meta_conf`` flag
        is set:

        * ``log_size``    -- dimensions come from ``value.schema["shape"]``.
        * ``log_schema``  -- the operation's schema is passed through.
        * ``log_stats``   -- column stats are computed with PandasHistograms
          when the operation carries a dataframe; otherwise the stats already
          stored on the operation are used.
        * ``log_preview`` -- the operation's preview (empty string otherwise).

        Histograms and histogram system metrics are never filled in here.
        """
        dimensions = value.schema["shape"] if meta_conf.log_size else None
        data_schema = value.schema if meta_conf.log_schema else None

        column_stats = {}
        if meta_conf.log_stats:
            if value.dataframe is None:
                column_stats = value.column_stats
            else:
                # Histograms are not supported yet, so only the stats half of
                # the result is kept.
                calculator = PandasHistograms(value.dataframe, meta_conf)
                column_stats, _ = calculator.get_histograms_and_stats()

        preview = value.preview if meta_conf.log_preview else ""

        return ValueMeta(
            value_preview=preview,
            data_dimensions=dimensions,
            data_schema=data_schema,
            data_hash=str(hash(self.to_signature(value))),
            columns_stats=column_stats,
            histogram_system_metrics=None,
            histograms={},
            query=value.query,
        )
示例#8
0
def test_pandas_v0_histograms():
    """Pin the PandasHistograms output for ``diverse_df``.

    The pinned statistics and histograms are meant to be stable across
    pandas v0 and pandas v1.
    """
    conf = ValueMetaConf.enabled()
    calculator = PandasHistograms(diverse_df, conf)
    actual_stats, actual_histograms = calculator.get_histograms_and_stats()

    # fmt: off
    expected_stats = {
        "bool_column": {
            "count": 100,
            "distinct": 3,
            "freq": 33,
            "non-null": 65,
            "null-count": 35,
            "top": False,
            "type": "bool",
            "unique": 2,
        },
        "float_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.7127659574,
            "min": 0.0,
            "non-null": 94,
            "null-count": 6,
            "std": 2.8572576537,
            "type": "float64",
        },
        "int_column": {
            "25%": 2.0,
            "50%": 5.0,
            "75%": 7.0,
            "count": 100,
            "distinct": 11,
            "max": 9.0,
            "mean": 4.8804347826,
            "min": 0.0,
            "non-null": 92,
            "null-count": 8,
            "std": 2.7449950111,
            "type": "float64",
        },
        "str_column": {
            "count": 100,
            "distinct": 5,
            "freq": 22,
            "non-null": 79,
            "null-count": 21,
            "top": "foo",
            "type": "str",
            "unique": 4,
        },
    }
    assert actual_stats == expected_stats

    # "str_column" calculation is unstable hence these unpacked assertions
    expected_columns = {
        "bool_column", "float_column", "int_column", "str_column"
    }
    assert set(actual_histograms.keys()) == expected_columns
    assert actual_histograms["bool_column"] == [
        [35, 33, 32], [None, False, True]
    ]

    # Both numeric columns are expected to share the same 20 bins over 0..9.
    bin_edges = [
        0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
        5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
    ]
    float_counts = [
        6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11
    ]
    int_counts = [
        2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12
    ]
    assert actual_histograms["float_column"] == [float_counts, bin_edges]
    assert actual_histograms["int_column"] == [int_counts, bin_edges]

    str_histogram = actual_histograms["str_column"]
    assert str_histogram[0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(str_histogram[1]) == {"foo", None, "", "baz", "bar"}
示例#9
0
def test_pandas_histograms_work_with_NaNs_and_nonseq_index(pandas_data_frame):
    """Check stats/histograms on a frame with NaNs and a non-sequential index.

    The fixture frame loses its "Names" column, gets a shuffled integer
    index, and is extended with one extra row that only defines "foo".
    Every original row therefore has no "foo" value and the extra row has
    no "Births" value.
    """
    # Arrange
    reindexed = pandas_data_frame.drop(columns="Names").set_index(
        [pd.Index([90, 30, 50, 70, 10])])  # emulate real world DF indices
    # DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
    # pd.concat with a one-row frame builds the same result.
    pandas_data_frame = pd.concat(
        [reindexed, pd.DataFrame([{"foo": 42}])], sort=False)
    meta_conf = ValueMetaConf.enabled()

    # Act
    stats, histograms = PandasHistograms(pandas_data_frame,
                                         meta_conf).get_histograms_and_stats()

    # Assert
    assert sorted(stats.keys()) == sorted(["Births", "foo"])  # noqa
    assert sorted(histograms.keys()) == sorted(["Births", "foo"])  # noqa
    assert stats == {
        "Births": {
            "25%": 155.0,
            "50%": 578.0,
            "75%": 968.0,
            "count": 6,
            "distinct": 6,
            "max": 973.0,
            "mean": 550.2,
            "min": 77.0,
            "non-null": 5,
            "null-count": 1,
            "std": 428.4246724921,
            "type": "float64",
        },
        "foo": {
            "25%": 42.0,
            "50%": 42.0,
            "75%": 42.0,
            "count": 6,
            "distinct": 2,
            "max": 42.0,
            "mean": 42.0,
            "min": 42.0,
            "non-null": 1,
            "null-count": 5,
            "type": "float64",
        },
    }
    # fmt: off
    assert histograms == {
        "Births": [
            [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 2],
            [
                77.0, 121.8, 166.6, 211.39999999999998, 256.2, 301.0,
                345.79999999999995, 390.59999999999997, 435.4, 480.2, 525.0,
                569.8, 614.5999999999999, 659.4, 704.1999999999999, 749.0,
                793.8, 838.5999999999999, 883.4, 928.1999999999999, 973.0
            ],
        ],
        "foo": [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0],
                [
                    41.5, 41.55, 41.6, 41.65, 41.7, 41.75, 41.8, 41.85, 41.9,
                    41.95, 42.0, 42.05, 42.1, 42.15, 42.2, 42.25, 42.3, 42.35,
                    42.4, 42.45, 42.5
                ]]
    }
示例#10
0
def test_pandas_v0_histograms():
    """Pin the PandasHistograms output for ``diverse_df``.

    Column stats are compared as an ordered list of ColumnStatsArgs; the
    histograms are checked column by column. The pinned values are meant to
    be stable across pandas v0 and pandas v1.
    """
    # Tests pandas histograms calculation is stable across Pandas v1 & v0
    meta_conf = ValueMetaConf.enabled()
    columns_stats, histograms = PandasHistograms(
        diverse_df, meta_conf).get_histograms_and_stats()

    # fmt: off
    # The comparison must be asserted: a bare ``==`` expression is evaluated
    # and discarded, which would leave the column stats unchecked.
    assert columns_stats == [
        ColumnStatsArgs(
            column_name="bool_column",
            column_type="bool",
            records_count=100,
            distinct_count=3,
            null_count=35,
            most_freq_value=False,
            most_freq_value_count=33,
            unique_count=2,
        ),
        ColumnStatsArgs(
            column_name="float_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=6,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.7127659574,
            min_value=0.0,
            std_value=2.8572576537,
        ),
        ColumnStatsArgs(
            column_name="int_column",
            column_type="float64",
            records_count=100,
            distinct_count=11,
            null_count=8,
            quartile_1=2.0,
            quartile_2=5.0,
            quartile_3=7.0,
            max_value=9.0,
            mean_value=4.8804347826,
            min_value=0.0,
            std_value=2.7449950111,
        ),
        ColumnStatsArgs(
            column_name="str_column",
            column_type="str",
            records_count=100,
            distinct_count=5,
            null_count=21,
            most_freq_value="foo",
            most_freq_value_count=22,
            unique_count=4,
        ),
        ColumnStatsArgs(
            column_name="multi_data_types",
            column_type="str",
            records_count=100,
            distinct_count=8,
            null_count=10,
            most_freq_value="foo",
            most_freq_value_count=11,
            unique_count=18,
        ),
    ]
    # "str_column" calculation is unstable hence these unpacked assertions
    assert set(histograms.keys()) == {
        "bool_column", "float_column", "int_column", "str_column",
        "multi_data_types"
    }
    assert histograms["bool_column"] == [[35, 33, 32], [None, False, True]]
    assert histograms["float_column"] == [
        [6, 0, 9, 0, 13, 0, 8, 0, 10, 0, 0, 8, 0, 6, 0, 15, 0, 8, 0, 11],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["int_column"] == [
        [2, 0, 13, 0, 9, 0, 9, 0, 7, 0, 0, 8, 0, 15, 0, 11, 0, 6, 0, 12],
        [
            0.0, 0.45, 0.9, 1.35, 1.8, 2.25, 2.7, 3.15, 3.6, 4.05, 4.5, 4.95,
            5.4, 5.8500000000000005, 6.3, 6.75, 7.2, 7.65, 8.1, 8.55, 9.0
        ]
    ]
    assert histograms["str_column"][0] == [22, 21, 20, 20, 17]
    # "str_column" calculation is unstable
    assert set(histograms["str_column"][1]) == {"foo", None, "", "baz", "bar"}