def test_message_to_dict_returns_default_values():
    """Zero-valued fields appear in the dict whether set explicitly or left unset.

    A message built with explicit zeros and one built with only ``count``
    must both convert to the same dictionary, zeros included.
    """
    explicit_zeros = protobuf.message_to_dict(
        DoublesMessage(min=0, max=0, sum=0, count=10)
    )
    unset_fields = protobuf.message_to_dict(DoublesMessage(count=10))

    # "count" is expected as a string, not an int (presumably the JSON
    # int64 encoding -- confirm against the converter).
    expected = {
        "min": 0.0,
        "max": 0.0,
        "sum": 0.0,
        "count": "10",
    }
    assert explicit_zeros == expected
    assert unset_fields == expected
def test_summary():
    """Check the StringTracker summary for a small sample with repeats and Nones.

    The frequent-items list is compared separately from the rest of the
    summary, after sorting, because its order is not fixed.
    """
    import pandas as pd

    tracker = StringTracker()
    for record in ["one", "two", "three", "one", "one", "One", "six", None, None]:
        tracker.update(record)

    sort_keys = ["value", "estimate"]

    # The order of these items should really be arbitrary, so they are kept
    # out of the plain dict comparison and matched as sorted frames instead.
    expected_items = pd.DataFrame(
        [
            {"value": "one", "estimate": 3.0},
            {"value": "three", "estimate": 1.0},
            {"value": "six", "estimate": 1.0},
            {"value": "One", "estimate": 1.0},
            {"value": "two", "estimate": 1.0},
        ]
    ).sort_values(sort_keys)
    expected = {
        "uniqueCount": {"estimate": 5.0, "upper": 5.0, "lower": 5.0},
        "frequent": {},
    }

    actual = message_to_dict(tracker.to_summary())
    actual_items = pd.DataFrame(actual["frequent"].pop("items")).sort_values(sort_keys)

    assert expected == actual
    pd.testing.assert_frame_equal(
        actual_items.reset_index(drop=True).sort_index(axis=1),
        expected_items.reset_index(drop=True).sort_index(axis=1),
    )
def test_summary():
    """Check the summary of a ColumnProfile that tracked the integers 1, 2, 3.

    The top-level unique count and the frequent items are removed from the
    converted summary and checked on their own; whatever is left must match
    the recorded dictionary exactly.
    """
    profile = ColumnProfile("col")
    for number in (1, 2, 3):
        profile.track(number)
    actual = message_to_dict(profile.to_summary())

    # Top-level unique count needs to be approximately equal
    unique_count = actual.pop("uniqueCount")
    assert unique_count == pytest.approx(
        {
            "estimate": 3.000000014901161,
            "lower": 3.0,
            "upper": 3.0001498026537594,
        },
        rel=0.0001,
    )

    # Frequent items carry no guaranteed order, so compare them as a set of
    # (jsonValue, estimate) pairs plus a length check to catch duplicates.
    frequent_items = actual.pop("frequentItems")
    assert set(frequent_items.keys()) == {"items"}
    expected_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(frequent_items["items"]) == len(expected_pairs)
    observed_pairs = {
        (item["jsonValue"], item["estimate"]) for item in frequent_items["items"]
    }
    assert observed_pairs == set(expected_pairs)

    # Compare the remainder of the message
    assert actual == {
        "counters": {
            "count": "3",
        },
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }
def test_all_zeros_returns_summary_with_stats():
    """An all-zero array must still yield min, max, stddev and mean.

    Also checks that the dict conversion and the parsed JSON conversion of
    the same summary agree.

    Raises:
        RuntimeError: if any of the required statistics is absent from the
            number summary of column "0".
    """
    required_stats = ("min", "max", "stddev", "mean")

    summary = array_profile(np.zeros([100, 1])).to_summary()
    as_dict = message_to_dict(summary)
    from_json = json.loads(message_to_json(summary))

    number_summary = as_dict["columns"]["0"]["numberSummary"]
    missing_stats = [name for name in required_stats if name not in number_summary]
    if missing_stats:
        raise RuntimeError(f"Stats missing from number summary: {missing_stats}")

    assert as_dict == from_json
def test_summary():
    """Check the summary of a ColumnProfile that tracked the integers 1, 2, 3.

    Three parts of the converted summary are removed and verified on their
    own -- the top-level unique count (approximate), the frequent numbers
    and the frequent items (both order-insensitive) -- and the remainder is
    compared exactly.
    """

    def decode_int64(raw):
        # Parse JSON encoded int64: such values may arrive as strings.
        return json.loads(raw) if isinstance(raw, str) else raw

    profile = ColumnProfile("col")
    for number in (1, 2, 3):
        profile.track(number)
    actual = message_to_dict(profile.to_summary())

    # Top-level unique count needs to be approximately equal
    unique_count = actual.pop("uniqueCount")
    assert unique_count == pytest.approx(
        {
            "estimate": 3.000000014901161,
            "lower": 3.0,
            "upper": 3.0001498026537594,
        },
        rel=0.0001,
    )

    # Frequent number counts can come back in any order, so they are
    # gathered into (value, count) pairs and compared as a set.
    frequent_numbers = actual["numberSummary"].pop("frequentNumbers")
    number_counts = [
        (decode_int64(entry["value"]), decode_int64(entry["estimate"]))
        for bucket in (frequent_numbers["longs"], frequent_numbers["doubles"])
        for entry in bucket
    ]
    expected_counts = {(1, 1), (2, 1), (3, 1)}
    assert len(number_counts) == len(expected_counts)
    assert set(number_counts) == expected_counts

    # The frequent items order is ambiguous as well
    frequent_items = actual.pop("frequentItems")
    assert set(frequent_items.keys()) == {"items"}
    expected_pairs = [("1", "1"), ("2", "1"), ("3", "1")]
    assert len(frequent_items["items"]) == len(expected_pairs)
    item_pairs = {
        (item["jsonValue"], item["estimate"]) for item in frequent_items["items"]
    }
    assert item_pairs == set(expected_pairs)

    # Compare the messages, excluding the parts removed above
    assert actual == {
        "counters": {"count": "3"},
        "schema": {
            "inferredType": {"type": "INTEGRAL", "ratio": 1.0},
            "typeCounts": {"INTEGRAL": "3"},
        },
        "numberSummary": {
            "count": "3",
            "min": 1.0,
            "max": 3.0,
            "mean": 2.0,
            "stddev": 1.0,
            "isDiscrete": False,
            "histogram": {
                "start": 1.0,
                "end": 3.0000003,
                "counts": ["3"],
                "max": 3.0,
                "min": 1.0,
                "bins": [1.0, 3.0000003],
                "n": "3",
                "width": 0.0,
            },
            "quantiles": {
                "quantiles": [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0],
                "quantileValues": [1.0, 1.0, 1.0, 1.0, 2.0, 3.0, 3.0, 3.0, 3.0],
            },
            "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
        },
    }
def test_message_to_dict_equals_message_to_json():
    """The dict conversion must match the parsed output of the JSON conversion."""
    message = DoublesMessage(min=0, max=1.0, sum=2.0, count=10)

    as_dict = protobuf.message_to_dict(message)
    from_json = json.loads(protobuf.message_to_json(message))

    assert as_dict == from_json
def test_summary():
    """Compare the full StringTracker summary against recorded values.

    Covers the unique count, the frequent items, the string length and token
    length summaries, and the per-character position summaries. Two parts
    need special treatment:

    * frequent items have no fixed order, so they are compared as sorted
      data frames;
    * the per-character "frequentNumbers" are dropped from both sides
      because, per the original authors, they differ between systems
      (macOS vs Ubuntu).
    """
    import pandas as pd

    percentiles = [0.0, 0.01, 0.05, 0.25, 0.5, 0.75, 0.95, 0.99, 1.0]
    one = {"estimate": 1.0, "upper": 1.0, "lower": 1.0}

    def single_value_summary(count, value, hist_end):
        """Expected number summary when every tracked number equals ``value``.

        ``count`` is the string-encoded number of observations and
        ``hist_end`` the recorded upper edge of the single histogram bin.
        """
        return {
            "count": count,
            "min": value,
            "max": value,
            "mean": value,
            "stddev": 0.0,
            "isDiscrete": False,
            "histogram": {
                "start": value,
                "end": hist_end,
                "counts": [count],
                "max": value,
                "min": value,
                "bins": [value, hist_end],
                "n": count,
                "width": 0.0,
            },
            "uniqueCount": dict(one),
            "quantiles": {
                "quantiles": list(percentiles),
                "quantileValues": [value] * len(percentiles),
            },
            "frequentNumbers": {
                "longs": [{"estimate": count, "value": str(int(value)), "rank": 0}],
                "doubles": [],
            },
        }

    tracker = StringTracker()
    for record in ["one", "two", "three", "one", "one", "One", "six", None, None]:
        tracker.update(record)

    # Check the full output. NOTE: the order of the "items" below should
    # really be arbitrary
    expected = {
        "uniqueCount": {"estimate": 5.0, "upper": 5.0, "lower": 5.0},
        "frequent": {
            "items": [
                {"value": "one", "estimate": 3.0},
                {"value": "three", "estimate": 1.0},
                {"value": "six", "estimate": 1.0},
                {"value": "One", "estimate": 1.0},
                {"value": "two", "estimate": 1.0},
            ]
        },
        "length": {
            "count": "7",
            "min": 3.0,
            "max": 5.0,
            "mean": 3.2857142857142856,
            "stddev": 0.7559289460184544,
            "histogram": {
                "start": 3.0,
                "end": 5.0000005,
                "counts": ["6", "1"],
                "max": 5.0,
                "min": 3.0,
                "bins": [3.0, 4.000000249999999, 5.0000005],
                "n": "7",
                "width": 0.0,
            },
            "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
            "quantiles": {
                "quantiles": list(percentiles),
                "quantileValues": [3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 5.0, 5.0, 5.0],
            },
            "frequentNumbers": {
                "longs": [
                    {"estimate": "6", "value": "3", "rank": 0},
                    {"estimate": "1", "value": "5", "rank": 1},
                ],
                "doubles": [],
            },
            "isDiscrete": False,
        },
        "tokenLength": single_value_summary("7", 1.0, 1.0000001),
        "charPosTracker": {
            "characterList": "!#$%&()*+,-./0123456789?@[]^_abcdefghijklmnopqrstuvwyz{}",
            "charPosMap": {
                "i": single_value_summary("1", 1.0, 1.0000001),
                "t": single_value_summary("2", 0.0, 0.0),
                "s": single_value_summary("1", 0.0, 0.0),
                "n": single_value_summary("4", 1.0, 1.0000001),
                "h": single_value_summary("1", 1.0, 1.0000001),
                "o": {
                    "count": "5",
                    "min": 0.0,
                    "max": 2.0,
                    "mean": 0.4,
                    "stddev": 0.894427190999916,
                    "histogram": {
                        "start": 0.0,
                        "end": 2.0000002,
                        "counts": ["4", "1"],
                        "max": 2.0,
                        "min": 0.0,
                        "bins": [0.0, 1.0000001, 2.0000002],
                        "n": "5",
                        "width": 0.0,
                    },
                    "uniqueCount": {"estimate": 2.0, "upper": 2.0, "lower": 2.0},
                    "quantiles": {
                        "quantiles": list(percentiles),
                        "quantileValues": [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0],
                    },
                    "frequentNumbers": {
                        "longs": [
                            {"estimate": "4", "value": "0", "rank": 0},
                            {"estimate": "1", "value": "2", "rank": 1},
                        ],
                        "doubles": [],
                    },
                    "isDiscrete": False,
                },
                "NITL": single_value_summary("1", 2.0, 2.0000002),
                "w": single_value_summary("1", 1.0, 1.0000001),
                "e": {
                    "count": "6",
                    "min": 2.0,
                    "max": 4.0,
                    "mean": 2.5,
                    "stddev": 0.8366600265340756,
                    "histogram": {
                        "start": 2.0,
                        "end": 4.0000004,
                        "counts": ["5", "1"],
                        "max": 4.0,
                        "min": 2.0,
                        "bins": [2.0, 3.0000002, 4.0000004],
                        "n": "6",
                        "width": 0.0,
                    },
                    "uniqueCount": {"estimate": 3.0, "upper": 3.0, "lower": 3.0},
                    "quantiles": {
                        "quantiles": list(percentiles),
                        "quantileValues": [2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 4.0, 4.0],
                    },
                    "frequentNumbers": {
                        "longs": [
                            {"estimate": "4", "value": "2", "rank": 0},
                            {"estimate": "1", "value": "4", "rank": 1},
                            {"estimate": "1", "value": "3", "rank": 2},
                        ],
                        "doubles": [],
                    },
                    "isDiscrete": False,
                },
                "r": single_value_summary("1", 2.0, 2.0000002),
            },
        },
    }

    sort_keys = ["value", "estimate"]
    expected_items = pd.DataFrame(expected["frequent"].pop("items")).sort_values(sort_keys)

    # Per-character frequent numbers are statistical in nature and differ
    # between systems, so they are removed before comparing. TODO: find out
    # whether the seeds can be fixed so the values stop changing between
    # macOS and Ubuntu.
    for char_summary in expected["charPosTracker"]["charPosMap"].values():
        char_summary.pop("frequentNumbers")

    actual = message_to_dict(tracker.to_summary())
    actual_items = pd.DataFrame(actual["frequent"].pop("items")).sort_values(sort_keys)

    # Same removal on the actual side, for the same reason.
    for char_summary in actual["charPosTracker"]["charPosMap"].values():
        char_summary.pop("frequentNumbers")

    assert expected == actual
    pd.testing.assert_frame_equal(
        actual_items.reset_index(drop=True).sort_index(axis=1),
        expected_items.reset_index(drop=True).sort_index(axis=1),
    )