def test_merge_different_columns():
    """Merge two profiles that share a session but track disjoint columns.

    The merged profile is expected to keep the shared name, session id and
    session timestamp, to contain both columns with one tracked value each,
    to carry the common tags plus a ``"Name"`` tag, and to report the
    metadata of the left-hand profile (``x1``).
    """
    # Timezone-aware UTC timestamp, matching test_merge_same_columns;
    # datetime.utcnow() is deprecated since Python 3.12 and returns a naive
    # datetime.
    now = datetime.datetime.now(datetime.timezone.utc)
    shared_session_id = uuid4().hex

    def make_profile(metadata_value):
        # Fresh tag/metadata dicts per profile so the two never share state.
        return DatasetProfile(
            name="test",
            session_id=shared_session_id,
            session_timestamp=now,
            tags={"key": "value"},
            metadata={"key": metadata_value},
        )

    x1 = make_profile("x1")
    x1.track("col1", "value")

    x2 = make_profile("x2")
    x2.track("col2", "value")

    merged = x1.merge(x2)

    assert merged.name == "test"
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert set(merged.columns.keys()) == {"col1", "col2"}
    assert merged.columns["col1"].counters.count == 1
    assert merged.columns["col2"].counters.count == 1
    assert merged.tags == {"Name": "test", "key": "value"}
    # The two inputs differ only in metadata; the merge result carries x1's.
    assert merged.metadata == {"key": "x1"}
def test_protobuf_round_trip():
    """Serialize a profile to protobuf, parse it back, and compare the two.

    Checks that re-serializing the parsed profile reproduces the same
    message, and that name, session id, session timestamp, columns, tag
    keys and metadata match the original profile.
    """
    now = datetime.datetime.utcnow()
    tags = {"k1": "rock", "k2": "scissors", "k3": "paper"}
    original = DatasetProfile(name="test", dataset_timestamp=now, tags=tags)
    for column_name in ("col1", "col2"):
        original.track(column_name, "value")

    msg = original.to_protobuf()
    roundtrip = DatasetProfile.from_protobuf(msg)

    assert roundtrip.to_protobuf() == msg
    assert roundtrip.name == "test"
    assert roundtrip.session_id == original.session_id

    # Timestamps are compared after conversion with to_utc_ms rather than
    # as raw datetime objects.
    expected_session_ms = to_utc_ms(original.session_timestamp)
    assert to_utc_ms(roundtrip.session_timestamp) == expected_session_ms

    assert set(list(roundtrip.columns.keys())) == {"col1", "col2"}
    for column_name in ("col1", "col2"):
        assert roundtrip.columns[column_name].counters.count == 1

    # The round-tripped profile is expected to carry an extra "Name" tag.
    tags["Name"] = "test"
    # NOTE(review): set() over a dict yields its keys, so this compares tag
    # names only, not tag values.
    assert set(roundtrip.tags) == set(tags)
    assert roundtrip.metadata == original.metadata
def test_write_delimited_multiple():
    """Parse a stream holding five delimited copies of one serialized profile.

    Every parsed entry must match the original profile's session id,
    session timestamp (after to_utc_ms conversion), tags and metadata.
    """
    now = datetime.datetime.utcnow()
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    single_entry = original.serialize_delimited()

    # Start with one copy and append four more, for five entries in total.
    stream = single_entry
    for _ in range(4):
        stream += single_entry

    entries = DatasetProfile.parse_delimited(stream)
    assert len(entries) == 5

    for parsed in entries:
        assert parsed.session_id == original.session_id
        # Compare timestamps via to_utc_ms instead of raw datetimes; the
        # parsed value is presumably not guaranteed to keep full precision.
        parsed_ms = time.to_utc_ms(parsed.session_timestamp)
        assert parsed_ms == time.to_utc_ms(original.session_timestamp)
        assert parsed.tags == original.tags
        assert parsed.metadata == original.metadata
def test_errors():
    """Tracking with a non-string ``columns`` plus ``data`` raises TypeError."""
    # Timezone-aware UTC timestamp, matching test_merge_same_columns;
    # datetime.utcnow() is deprecated since Python 3.12.
    now = datetime.datetime.now(datetime.timezone.utc)
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )

    # Only the track() call sits inside the context manager, so a TypeError
    # raised during setup would not be mistaken for the expected one.
    with pytest.raises(TypeError):
        original.track(columns=1, data=34)
def test_viz():
    """Smoke-test ProfileVisualizer with a single tracked profile.

    There are no assertions; the test passes as long as listing the
    available plots and setting the profiles do not raise.
    """
    created_at = datetime.datetime.utcnow()
    profile = DatasetProfile(
        name="test",
        session_id=uuid4().hex,
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    profile.track("col1", "value")

    visualizer = ProfileVisualizer()
    # The return value is intentionally unused; only the call is exercised.
    visualizer.available_plots()
    visualizer.set_profiles([profile])
def test_track():
    """Smoke-test tracking a dict that maps column names to values.

    There are no assertions; the test passes as long as ``track`` accepts
    the mapping without raising.
    """
    # Timezone-aware UTC timestamp, matching test_merge_same_columns;
    # datetime.utcnow() is deprecated since Python 3.12.
    now = datetime.datetime.now(datetime.timezone.utc)
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=now,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    data = {
        "rows": 1,
        "names": "roger roger",
    }
    original.track(columns=data)
def test_chunk_iterator():
    """Every item yielded by ``chunk_iterator`` on a tracked profile is non-None."""
    created_at = datetime.datetime.utcnow()
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    record = {"rows": 1, "names": "roger roger"}
    original.track(columns=record)

    for chunk in original.chunk_iterator():
        assert chunk is not None
def test_write_delimited_single():
    """Serialize one profile in delimited form and parse that single entry.

    The parsed profile must match the original's session id, session
    timestamp (after to_utc_ms conversion), tags and metadata.
    """
    created_at = datetime.datetime.utcnow()
    original = DatasetProfile(
        name="test",
        session_id="test.session.id",
        session_timestamp=created_at,
        tags={"key": "value"},
        metadata={"key": "value"},
    )
    original.track("col1", "value")

    serialized = original.serialize_delimited()
    # parse_delimited_single returns a pair; only the parsed profile (the
    # second element) is checked here.
    _, roundtrip = DatasetProfile.parse_delimited_single(serialized)

    assert roundtrip.session_id == original.session_id
    # Compare timestamps via to_utc_ms instead of raw datetimes; the parsed
    # value is presumably not guaranteed to keep full precision.
    expected_ms = time.to_utc_ms(original.session_timestamp)
    assert time.to_utc_ms(roundtrip.session_timestamp) == expected_ms
    assert roundtrip.tags == original.tags
    assert roundtrip.metadata == original.metadata
def test_merge_same_columns():
    """Merge two profiles that share a session and overlap on one column.

    ``col1`` is tracked once in each profile, so its merged count is 2;
    ``col2`` is tracked only in the second profile, so its count stays 1.
    """
    now = datetime.datetime.now(datetime.timezone.utc)
    shared_session_id = uuid4().hex

    def make_profile():
        # Fresh tag/metadata dicts on every call so the profiles share none.
        return DatasetProfile(
            name="test",
            session_id=shared_session_id,
            session_timestamp=now,
            tags={"key": "value"},
            metadata={"key": "value"},
        )

    first = make_profile()
    first.track("col1", "value1")

    second = make_profile()
    second.track("col1", "value1")
    second.track("col2", "value")

    merged = first.merge(second)

    assert merged.name == "test"
    assert merged.session_id == shared_session_id
    assert merged.session_timestamp == now
    assert set(list(merged.columns.keys())) == {"col1", "col2"}

    expected_counts = (("col1", 2), ("col2", 1))
    for column_name, expected in expected_counts:
        assert merged.columns[column_name].counters.count == expected
def test_track_null_item():
    """Tracking ``None`` increments the column's null count in the flat summary.

    After one ``None`` value the first summary row reports the column name
    and a null count of 1; after a second ``None`` the null count is 2.
    """
    # NOTE(review): a first profile is created, tracks a non-null value and
    # is then discarded without being inspected; confirm whether this setup
    # step is intentional.
    DatasetProfile("name").track("column_name", 1)

    prof = DatasetProfile("name")

    def first_row(field):
        # Recompute the flat summary on every lookup and read row 0.
        return prof.flat_summary()["summary"][field][0]

    prof.track("column_name", None)
    assert first_row("column") == "column_name"
    assert first_row("null_count") == 1

    prof.track("column_name", None)
    assert first_row("null_count") == 2
    assert first_row("column") == "column_name"