示例#1
0
def test_string_as_arrays_does_not_throw():
    """A string that looks like a JSON array is tracked as UNKNOWN, not crashing.

    Fix: removed the dead expression statement ``InferredType.Type`` that
    evaluated an attribute and discarded the result.
    """
    c = ColumnProfile("col")
    data = "[0,0]"  # this string will be parsed as an array
    c.track(data)
    summary: ColumnSummary = c.to_summary()
    assert summary.schema.inferred_type.type == InferredType.Type.UNKNOWN
示例#2
0
 def track_datum(self, column_name, data):
     """Track one value for *column_name*, creating its profile on first use."""
     if column_name not in self.columns:
         self.columns[column_name] = ColumnProfile(column_name)
     self.columns[column_name].track(data)
示例#3
0
def test_summary():
    """Summary of a small integer column matches the expected serialized dict."""
    profile = ColumnProfile("col")
    for value in (1, 2, 3):
        profile.track(value)
    actual = message_to_dict(profile.to_summary())
    expected = {
        "counters": {
            "count": "3",
        },
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }
    # Top-level unique count needs to be approximately equal
    approx_unique = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    assert actual.pop("uniqueCount") == pytest.approx(approx_unique, 0.0001)

    # Cannot do a straightforward frequentItems count since order is ambiguous
    freq = actual.pop("frequentItems")
    assert set(freq.keys()) == {"items"}
    expected_items = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(freq["items"]) == len(expected_items)
    observed = [(item["jsonValue"], item["estimate"]) for item in freq["items"]]
    assert set(observed) == set(expected_items)

    # Compare the messages, excluding the frequent numbers counters
    assert actual == expected
示例#4
0
def test_mostly_nulls_inferred_type_not_null():
    """One non-null value among many nulls must keep the inferred type non-NULL."""
    column = ColumnProfile("col")
    values = [None, np.nan, None] * 3 + ["not a null val!"]
    for value in values:
        column.track(value)
    assert column.to_summary().schema.inferred_type.type != InferredType.Type.NULL
示例#5
0
def test_all_nulls_inferred_type_null(data, nulls_expected, expected_type):
    """Parametrized check: null count and inferred type match expectations.

    Fix: removed the dead expression statement ``InferredType.Type`` that
    evaluated an attribute and discarded the result.
    """
    c = ColumnProfile("col")
    for val in data:
        c.track(val)
    summary: ColumnSummary = c.to_summary()
    assert summary.counters.null_count.value == nulls_expected
    assert summary.schema.inferred_type.type == expected_type
示例#6
0
    def track_datum(self, column_name, data, character_list=None, token_method=None):
        """
        Track a single value for *column_name*, creating the column profile
        (with its constraints, if any) on first use.

        Parameters
        ----------
        column_name : str
            Name of the column to track the value under.
        data : object
            The value to track.
        character_list : optional
            Forwarded to ``ColumnProfile.track``.
        token_method : optional
            Forwarded to ``ColumnProfile.track``.
        """
        try:
            prof = self.columns[column_name]
        except KeyError:
            constraints = None if self.constraints is None else self.constraints[column_name]
            prof = ColumnProfile(column_name, constraints=constraints)
            self.columns[column_name] = prof

        # Bug fix: forward the caller's arguments instead of hard-coded None,
        # which silently discarded character_list and token_method.
        prof.track(data, character_list=character_list, token_method=token_method)
示例#7
0
def test_frequent_items_do_not_track_nulls():
    """Null-only input leaves frequent-items and cardinality trackers empty."""
    column = ColumnProfile("col")
    for value in (None, np.nan, None):
        column.track(value)
    assert column.frequent_items.is_empty()
    assert column.frequent_items.to_summary() is None
    assert column.cardinality_tracker.to_summary() is None
    assert column.cardinality_tracker.is_empty()
示例#8
0
    def track_datum(self, column_name, data):
        """Track one value for *column_name*, lazily creating its profile."""
        if column_name not in self.columns:
            if self.constraints is None:
                constraints = None
            else:
                constraints = self.constraints[column_name]
            self.columns[column_name] = ColumnProfile(column_name, constraints=constraints)
        self.columns[column_name].track(data)
def test_all_nulls_inferred_type_null():
    """A column tracked with only None/NaN values is inferred as NULL."""
    import numpy as np
    from whylogs.proto import InferredType

    column = ColumnProfile("col")
    for value in [None, np.nan, None] * 3:
        column.track(value)
    summary = column.to_summary()
    assert summary.schema.inferred_type.type == InferredType.Type.NULL
def test_merge():
    """Merging a profile with itself doubles every counter."""
    profile = ColumnProfile("test")
    for value in (1, 1.0, "string", True, False, None):
        profile.track(value)

    doubled = profile.merge(profile)
    assert doubled.counters.count == 12
    assert doubled.counters.null_count == 2
    assert doubled.counters.true_count == 4
    assert doubled.number_tracker.ints.count == 0
    assert doubled.number_tracker.floats.count == 4
    assert doubled.string_tracker.count == 2
def test_track():
    """Mixed-type tracking routes values to the expected sub-trackers."""
    column = ColumnProfile("col")
    values = [1, 2, 3, "string 1", "string 2", "3", 4.0, "3.95", "3.95st", None, True]
    for value in values:
        column.track(value)

    numbers = column.number_tracker
    assert numbers.floats.count == 6
    assert numbers.ints.count == 0
    assert numbers.floats.min == 1.0
    assert numbers.floats.max == 4.0

    assert column.counters.count == len(values)
    assert column.counters.null_count == 1
    assert column.counters.true_count == 1
示例#12
0
def test_all_numeric_types_get_tracked_by_number_tracker():
    """Every numeric container type should feed the number tracker."""
    cases = (
        [1.0, 2.0, 3.0],
        [1, 2, 3],
        np.arange(4),
        np.linspace(1, 2, 5),
        pd.Series(np.arange(3)),
        np.zeros(3, dtype=np.int32),
        np.zeros(3, dtype=np.int16),
    )
    for case in cases:
        column = ColumnProfile("test")
        for value in case:
            column.track(value)
        assert column.number_tracker.count == len(case)
示例#13
0
    def from_protobuf(message: DatasetProfileMessage):
        """
        Load from a protobuf message

        Parameters
        ----------
        message : DatasetProfileMessage
            The protobuf message.  Should match the output of
            `DatasetProfile.to_protobuf()`

        Returns
        -------
        dataset_profile : DatasetProfile
        """
        props = message.properties
        columns = {
            name: ColumnProfile.from_protobuf(col)
            for name, col in message.columns.items()
        }
        return DatasetProfile(
            name=props.tags["Name"],
            session_id=props.session_id,
            session_timestamp=from_utc_ms(props.session_timestamp),
            data_timestamp=from_utc_ms(props.data_timestamp),
            columns=columns,
            tags=dict(props.tags),
            metadata=dict(props.metadata),
        )
示例#14
0
    def from_protobuf(message: DatasetProfileMessage) -> "DatasetProfile":
        """
        Load from a protobuf message

        Parameters
        ----------
        message : DatasetProfileMessage
            The protobuf message.  Should match the output of
            `DatasetProfile.to_protobuf()`

        Returns
        -------
        dataset_profile : DatasetProfile
        """
        properties: DatasetProperties = message.properties
        tags = properties.tags or {}
        # Accept either capitalization of the name tag; fall back to "".
        name = tags.get("name", None) or tags.get("Name", None) or ""

        columns = {
            key: ColumnProfile.from_protobuf(value)
            for key, value in message.columns.items()
        }
        # NOTE(review): the field spelling `modeProfile` is kept as-is — it
        # presumably mirrors the proto definition; confirm before "fixing".
        return DatasetProfile(
            name=name,
            session_id=properties.session_id,
            session_timestamp=from_utc_ms(properties.session_timestamp),
            dataset_timestamp=from_utc_ms(properties.data_timestamp),
            columns=columns,
            tags=dict(tags),
            metadata=dict(properties.metadata or {}),
            model_profile=ModelProfile.from_protobuf(message.modeProfile),
        )
示例#15
0
    def _do_merge(self, other):
        """Merge column-by-column with *other* and return a new DatasetProfile."""
        all_names = set(self.columns) | set(other.columns)

        merged_columns = {}
        for name in all_names:
            if self.constraints is None:
                constraints = None
            else:
                constraints = self.constraints[name]
            # Placeholder stands in for whichever side is missing the column.
            placeholder = ColumnProfile(name, constraints=constraints)
            mine = self.columns.get(name, placeholder)
            theirs = other.columns.get(name, placeholder)
            merged_columns[name] = mine.merge(theirs)

        if self.model_profile is None:
            merged_model = other.model_profile
        else:
            merged_model = self.model_profile.merge(other.model_profile)

        return DatasetProfile(
            name=self.name,
            session_id=self.session_id,
            session_timestamp=self.session_timestamp,
            dataset_timestamp=self.dataset_timestamp,
            columns=merged_columns,
            tags=self.tags,
            metadata=self.metadata,
            model_profile=merged_model,
        )
示例#16
0
def test_fallback_number_counter():
    """With an empty HLL sketch, unique count falls back to the number tracker."""
    column = ColumnProfile("test")
    for value in (1, 1.0, 2, 3, 4, 5, 6, 6.0, "text"):
        column.track(value)
    column.cardinality_tracker = HllSketch()

    summary = column.to_summary()
    assert summary.unique_count.estimate == summary.number_summary.unique_count.estimate
示例#17
0
def test_fallback_fallbacks_to_number_counter():
    """Mixed string/float data with an empty HLL still uses the number counter."""
    column = ColumnProfile("test")
    for value in ("a", "b", 1.0, 2.0):
        column.track(value)
    column.cardinality_tracker = HllSketch()

    summary = column.to_summary()
    assert summary.unique_count.estimate == summary.number_summary.unique_count.estimate
示例#18
0
def test_copy_counters_null_count_in_schema_tracker():
    """Null counts live in the schema tracker and survive protobuf round-trips."""
    column = ColumnProfile("test")
    for value in ("a", "b", None, "d", pd.NA, "f", 1.0, 2.0):
        column.track(value)
    assert column.schema_tracker.get_count(InferredType.Type.NULL) == 2

    # ensuring we can still access the value in summary mode
    assert column.to_summary().counters.null_count.value == 2

    # Mimic a legal protobuf with null_count set
    message: ColumnMessage = column.to_protobuf()
    message.counters.null_count.value = 2

    restored = ColumnProfile.from_protobuf(message)
    assert restored.schema_tracker.get_count(InferredType.Type.NULL) == 4
示例#19
0
 def _do_merge(self, other):
     """Merge columns with *other* and return a new DatasetProfile."""
     merged_columns = {}
     for name in set(self.columns) | set(other.columns):
         # Placeholder stands in for whichever side is missing the column.
         placeholder = ColumnProfile(name)
         mine = self.columns.get(name, placeholder)
         theirs = other.columns.get(name, placeholder)
         merged_columns[name] = mine.merge(theirs)
     return DatasetProfile(
         name=self.name,
         session_id=self.session_id,
         session_timestamp=self.session_timestamp,
         dataset_timestamp=self.dataset_timestamp,
         columns=merged_columns,
         tags=self.tags,
         metadata=self.metadata,
     )
示例#20
0
def test_protobuf():
    """Round-trip through protobuf preserves the column name and trackers."""
    column = ColumnProfile("col")
    for value in (1, 2, 3):
        column.track(value)
    restored = ColumnProfile.from_protobuf(column.to_protobuf())
    assert restored.column_name == column.column_name == "col"
    assert hasattr(restored, "number_tracker")
    assert hasattr(restored, "string_tracker")
    assert restored.string_tracker.length is not None

    assert restored.string_tracker.length.count == 0
    assert len(restored.string_tracker.char_pos_tracker.character_list) == 56
    restored.to_protobuf()
示例#21
0
    def merge(self, other):
        """
        Merge this profile with another dataset profile object.

        This operation will drop the metadata from the 'other' profile object.

        Parameters
        ----------
        other : DatasetProfile

        Returns
        -------
        merged : DatasetProfile
            New, merged DatasetProfile
        """
        self.validate()
        other.validate()

        # Both profiles must describe the same logical dataset slice.
        assert self.session_id == other.session_id
        assert self.session_timestamp == other.session_timestamp
        assert self.dataset_timestamp == other.dataset_timestamp
        assert self.tags == other.tags

        merged_columns = {}
        for name in set(self.columns) | set(other.columns):
            # Placeholder stands in for whichever side is missing the column.
            placeholder = ColumnProfile(name)
            mine = self.columns.get(name, placeholder)
            theirs = other.columns.get(name, placeholder)
            merged_columns[name] = mine.merge(theirs)

        return DatasetProfile(
            name=self.name,
            session_id=self.session_id,
            session_timestamp=self.session_timestamp,
            dataset_timestamp=self.dataset_timestamp,
            columns=merged_columns,
            tags=self.tags,
            metadata=self.metadata,
        )
def test_protobuf():
    """Protobuf round-trip keeps the frequent-numbers sketch equivalent."""
    column = ColumnProfile("col")
    for value in (1, 2, 3):
        column.track(value)
    message = column.to_protobuf()
    restored = ColumnProfile.from_protobuf(message)
    assert restored.column_name == column.column_name == "col"
    assert hasattr(restored, "number_tracker")
    message2 = restored.to_protobuf()
    # We cannot do a straight equality comparison for serialized frequent
    # strings objects
    compare_frequent_items(
        restored.number_tracker.frequent_numbers.get_frequent_items(),
        column.number_tracker.frequent_numbers.get_frequent_items(),
    )
    message.numbers.frequent_numbers.sketch = bytes()
    message2.numbers.frequent_numbers.sketch = bytes()
def test_summary():
    """Summary of a small integer column matches the expected serialized dict,
    with order-sensitive frequent-number/item sections compared as sets."""
    profile = ColumnProfile("col")
    for value in (1, 2, 3):
        profile.track(value)
    actual = message_to_dict(profile.to_summary())
    expected = {
        "counters": {"count": "3",},
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }
    # Top-level unique count needs to be approximately equal
    approx_unique = {
        "estimate": 3.000000014901161,
        "lower": 3.0,
        "upper": 3.0001498026537594,
    }
    assert actual.pop("uniqueCount") == pytest.approx(approx_unique, 0.0001)

    # Cannot do a straightforward comparison of frequent number counts, since
    # their orders can vary
    frequent_numbers = actual["numberSummary"].pop("frequentNumbers")
    pairs = []
    for entry in frequent_numbers["longs"] + frequent_numbers["doubles"]:
        value = entry["value"]
        if isinstance(value, str):
            # Parse JSON encoded int64
            value = json.loads(value)
        count = entry["estimate"]
        if isinstance(count, str):
            # Parse JSON encoded int64
            count = json.loads(count)
        pairs.append((value, count))
    expected_pairs = {(1, 1), (2, 1), (3, 1)}
    assert len(pairs) == len(expected_pairs)
    assert set(pairs) == expected_pairs

    # Cannot do a straightforward frequentItems count since order is ambiguous
    freq_items = actual.pop("frequentItems")
    assert set(freq_items.keys()) == {"items"}
    expected_items = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(freq_items["items"]) == len(expected_items)
    observed = [(item["jsonValue"], item["estimate"]) for item in freq_items["items"]]
    assert set(observed) == set(expected_items)

    # Compare the messages, excluding the frequent numbers counters
    assert actual == expected