def test_log_pil_image(tmpdir, image_files):
    """Log PIL images through a minute-rotation logger; expected columns must appear."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    # round-trip the writer config through YAML, as the other tests do
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("image_pil_test", with_rotation_time="1m", cache_size=1) as logger:
        for path in image_files:
            logger.log_image(Image.open(path))
        cols = logger.profile.columns
        for expected in _EXPECTED_COLUMNS:
            assert expected in cols, f"{expected} not found in {cols}"
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_metrics_with_boolean_labels(tmpdir):
    """Boolean targets/predictions should produce a two-label confusion matrix."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    targets = [True, False, True]
    predictions = [False, True, False]
    scores = [0.2, 0.5, 0.6]
    with sess.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)
        metrics_profile = logger.profile.model_profile
        assert metrics_profile is not None
        # True/False -> exactly two distinct labels
        assert len(metrics_profile.metrics.confusion_matrix.labels) == 2
    shutil.rmtree(out_dir, ignore_errors=True)
def test_s3_writer_metadata(df_lending_club, moto_boto, s3_all_config_metadata_path):
    """Segment estimation writes metadata/segments.json; a later log adds profile objects."""
    assert os.path.exists(s3_all_config_metadata_path)
    sess = session_from_config(load_config(s3_all_config_metadata_path))
    sess.estimate_segments(df_lending_club, name="dataset_test",
                           target_field="funded_amnt_inv", max_segments=30)
    client = boto3.client("s3")
    listing = client.list_objects(Bucket="mocked_bucket")
    # only the segments metadata file should exist at this point
    for obj in listing["Contents"]:
        assert obj["Key"] == "metadata/segments.json"
    with sess.logger("dataset_test") as logger:
        logger.log_dataframe(df_lending_club)
    sess.close()
    listing = client.list_objects(Bucket="mocked_bucket")
    print(listing)
    for idx, obj in enumerate(listing["Contents"]):
        print(obj["Key"])
        assert obj["Key"] == object_keys_meta_config[idx]
def profile_csv(session_config: SessionConfig, project_dir: str) -> str:
    """Prompt for a CSV path (or fall back to the bundled demo dataset),
    profile it with a whylogs session, and return the absolute input path.

    :param session_config: session configuration used to build the logging session
    :param project_dir: directory the demo CSV is copied into when no path is given
    :return: absolute path of the CSV file that was profiled
    """
    package_nb_path = os.path.join(os.path.dirname(__file__), "notebooks")
    demo_csv = os.path.join(package_nb_path, LENDING_CLUB_CSV)
    # An empty StringIO default lets us detect "user pressed enter" below.
    file: io.TextIOWrapper = click.prompt(
        "CSV input path (leave blank to use our demo dataset)",
        type=click.File(mode="rt"),
        default=io.StringIO(),
        show_default=False,
    )
    # Fix: isinstance() is the idiomatic type check (was: type(file) is io.StringIO).
    if isinstance(file, io.StringIO):
        echo("Using the demo Lending Club Data (1K randomized samples)", fg="green")
        destination_csv = os.path.join(project_dir, LENDING_CLUB_CSV)
        echo("Copying the demo file to: %s" % destination_csv)
        shutil.copy(demo_csv, destination_csv)
        full_input = os.path.realpath(destination_csv)
    else:
        # a real file was supplied; we only need its path, not the handle
        file.close()
        full_input = os.path.realpath(file.name)
    echo(f"Input file: {full_input}")
    echo(RUN_PROFILING)
    session = session_from_config(session_config)
    df = pd.read_csv(full_input)
    session.log_dataframe(df)
    session.close()
    return full_input
def test_log_metrics(tmpdir):
    """Three distinct class labels should produce a three-label confusion matrix.

    Consistency fix: pass ignore_errors=True to shutil.rmtree as the sibling
    tests in this file do, so a missing/locked directory cannot fail the test
    for an unrelated reason.
    """
    output_path = tmpdir.mkdir("whylogs")
    shutil.rmtree(output_path, ignore_errors=True)
    writer_config = WriterConfig("local", ["protobuf"], output_path.realpath())
    yaml_data = writer_config.to_yaml()
    WriterConfig.from_yaml(yaml_data)
    session_config = SessionConfig("project", "pipeline", writers=[writer_config])
    session = session_from_config(session_config)
    targets = ["class_name1", "class_name2", "class_name3"]
    predictions = ["class_name1", "class_name2", "class_name2"]
    scores = [0.2, 0.5, 0.6]
    num_labels = 3
    with session.logger("metrics_test") as logger:
        logger.log_metrics(targets, predictions, scores)
        profile = logger.profile
        metrics_profile = profile.model_profile
        assert metrics_profile is not None
        assert len(metrics_profile.metrics.confusion_matrix.labels) == num_labels
    shutil.rmtree(output_path, ignore_errors=True)
def test_segments_with_rotation(df_lending_club, tmpdir):
    """Segmented logging with per-second rotation writes the expected file count."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        sess = session_from_config(sess_cfg)
        with sess.logger("test", with_rotation_time='s', segments=["home_ownership"],
                         profile_full_dataset=True, cache=1) as logger:
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            logger.log_dataframe(df_lending_club)
            frozen_time.tick(delta=datetime.timedelta(seconds=1))
            # a frame without the segment column must raise
            df = util.testing.makeDataFrame()
            with pytest.raises(KeyError):
                logger.log_dataframe(df)
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 8
    shutil.rmtree(out_dir)
def test_log_rotation_days(tmpdir):
    """Daily rotation should produce one output file per distinct day logged."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        sess = session_from_config(sess_cfg)
        with sess.logger("test", with_rotation_time='d', cache_size=1) as logger:
            logger.log_dataframe(util.testing.makeDataFrame())
            frozen_time.tick(delta=datetime.timedelta(days=1))
            # two logs on the same (second) day share one rotation window
            logger.log_dataframe(util.testing.makeDataFrame())
            logger.log_dataframe(util.testing.makeDataFrame())
            frozen_time.tick(delta=datetime.timedelta(days=2))
            logger.log_dataframe(util.testing.makeDataFrame())
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 3
    shutil.rmtree(out_dir)
def test_log_multiple_calls(tmpdir, df_lending_club):
    """Five logger sessions with distinct timestamps each write their own files.

    Bug fix: the original saved ``os.curdir`` — which is the literal string
    ``"."`` — before ``os.chdir(script_dir)``, so the final ``os.chdir`` was a
    no-op and never restored the caller's working directory. Capture
    ``os.getcwd()`` instead, and restore it in a try/finally so a failing
    assertion cannot leave the cwd changed for later tests.
    """
    original_dir = os.getcwd()
    os.chdir(script_dir)
    try:
        p = tmpdir.mkdir("whylogs")
        writer_config = WriterConfig("local", ["protobuf", "flat"], p.realpath(),
                                     filename_template="dataset_summary-$dataset_timestamp")
        yaml_data = writer_config.to_yaml()
        WriterConfig.from_yaml(yaml_data)
        session_config = SessionConfig("project", "pipeline", writers=[writer_config])
        session = session_from_config(session_config)
        now = datetime.datetime.now()
        for i in range(0, 5):
            with session.logger(dataset_timestamp=now + datetime.timedelta(days=i)) as logger:
                logger.log_dataframe(df_lending_club)
        output_files = []
        for root, subdirs, files in os.walk(p):
            output_files += files
        # we run 5 times, so we should have five times more files than the above test
        assert len(output_files) == 25
    finally:
        os.chdir(original_dir)
def test_log_dataframe(tmpdir, df_lending_club):
    """Logging the lending-club frame yields a full flat summary and output files."""
    out_dir = tmpdir.mkdir("whylogs")
    writer_cfg = WriterConfig("local", ["protobuf", "flat"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("lendingclub") as logger:
        assert logger is not None
        logger.log_dataframe(df_lending_club)
        profile = logger.profile
        assert profile is not None
        flat_summary = profile.flat_summary()['summary']
        assert len(flat_summary) == 151
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 5
def test_session_log_dataframe(df):
    """Logging a dataframe through a default session leaves a usable logger.

    Fix: removed a leftover ``pass`` statement (dead code).
    NOTE(review): another ``test_session_log_dataframe`` is defined later in
    this file and shadows this one under pytest — consider renaming one.
    """
    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    session.log_dataframe(df)
    assert session.logger() is not None
    assert session.logger("default-project").dataset_name == "default-project"
def test_session_log_dataframe():
    """Logging a generated dataframe through a default session leaves a usable logger.

    Fix: removed the unused ``_session = None`` local.
    NOTE(review): this re-defines ``test_session_log_dataframe`` from earlier
    in the file, shadowing it under pytest — consider renaming one of them.
    """
    session = session_from_config(
        SessionConfig("default-project", "default-pipeline", [], False))
    df = util.testing.makeDataFrame()
    session.log_dataframe(df)
    assert session.logger() is not None
    assert session.logger("default-project").dataset_name == "default-project"
def test_session_profile(df):
    """A default session returns a profile whose flat summary has 4 rows.

    NOTE(review): a later ``test_session_profile`` re-definition shadows this
    one under pytest — consider renaming.
    """
    cfg = SessionConfig("default-project", "default-pipeline", [], False)
    profile = session_from_config(cfg).log_dataframe(df)
    assert profile is not None
    flat_summary = profile.flat_summary()["summary"]
    assert len(flat_summary) == 4
def test_session_profile():
    """A default session profiles a generated frame into a 4-row flat summary."""
    cfg = SessionConfig("default-project", "default-pipeline", [], False)
    sess = session_from_config(cfg)
    frame = util.testing.makeDataFrame()
    profile = sess.log_dataframe(frame)
    assert profile is not None
    flat = profile.flat_summary()['summary']
    assert len(flat) == 4
def test_segments(df_lending_club, tmpdir):
    """Explicit segment definitions produce one tagged profile per segment."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    rent_seg = [{"key": "home_ownership", "value": "RENT"}]
    mortgage_seg = [{"key": "home_ownership", "value": "MORTGAGE"}]
    with session_from_config(sess_cfg) as session:
        with session.logger("test", segments=[rent_seg, mortgage_seg], cache_size=1) as logger:
            logger.log_dataframe(df_lending_club)
        profile = logger.profile
        profiles = logger.segmented_profiles
        mortgage_segment = logger.get_segment(mortgage_seg)
        # with segments configured there is no single top-level profile
        assert profile is None
        assert len(profiles) == 2
        keys = list(profiles.keys())
        assert profiles[keys[0]].tags["segment"] == json.dumps(rent_seg)
        assert profiles[keys[1]].tags["segment"] == json.dumps(mortgage_seg)
        assert mortgage_segment == profiles[keys[1]]
    shutil.rmtree(out_dir, ignore_errors=True)
def test_config_api(tmpdir):
    """Smoke test: a session built from config logs an empty frame and closes cleanly."""
    out_dir = tmpdir.mkdir("whylogs")
    writer_cfg = WriterConfig("local", ["protobuf", "flat"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("test_name") as logger:
        logger.log_dataframe(pd.DataFrame())
    sess.close()
def test_segments_keys(df_lending_club, tmpdir):
    """Segmenting by column names auto-discovers 47 distinct segments."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("test", segments=["emp_title", "home_ownership"], cache_size=1) as logger:
        logger.log_dataframe(df_lending_club)
        assert len(logger.segmented_profiles) == 47
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_multiple_segments(tmpdir):
    """log_segments over two key columns yields one profile per (x, y) pair."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    frame = pd.DataFrame(data={"x": [1, 2, 3, 1, 2, 3, 1, 2, 3],
                               "y": [4, 5, 6, 5, 6, 4, 6, 4, 5],
                               "z": [0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3]})
    with sess.logger("image_test", segments=["x", "y"]) as logger:
        logger.log_segments(frame)
        # 9 unique (x, y) combinations in the frame
        assert len(logger.segmented_profiles) == 9
def test_s3_writer(df_lending_club, moto_boto, s3_all_config_path):
    """After logging and closing, the mocked bucket holds the expected object keys."""
    assert os.path.exists(s3_all_config_path)
    sess = session_from_config(load_config(s3_all_config_path))
    with sess.logger("dataset_test_s3") as logger:
        logger.log_dataframe(df_lending_club)
    sess.close()
    client = boto3.client("s3")
    listing = client.list_objects(Bucket="mocked_bucket")
    for idx, obj in enumerate(listing["Contents"]):
        assert obj["Key"] == object_keys[idx]
def test_mlflow_patched(mlflow_config_path):
    """enable_mlflow should patch mlflow with a whylogs attribute.

    Fix: removed a leftover debug statement (``print("HEY LISTEN")``).
    """
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    config = load_config(mlflow_config_path)
    session = session_from_config(config)
    assert whylogs.enable_mlflow(session)
    assert mlflow.whylogs is not None
    # always undo the patch so later tests see an unpatched mlflow
    whylogs.mlflow.disable_mlflow()
def test_s3_writer(df_lending_club, moto_boto, s3_config_path):
    """A single summary object lands in S3 and nothing is written locally.

    NOTE(review): this re-defines ``test_s3_writer`` from earlier in the file,
    shadowing it under pytest — consider renaming one of them.
    """
    assert os.path.exists(s3_config_path)
    sess = session_from_config(load_config(s3_config_path))
    with sess.logger("dataset_test_s3") as logger:
        logger.log_dataframe(df_lending_club)
    client = boto3.client('s3')
    listing = client.list_objects(Bucket="mocked_bucket")
    keys = [obj["Key"] for obj in listing["Contents"]]
    assert len(keys) == 1
    assert listing["Contents"][0]["Key"] == "dataset_test_s3/dataset_summary/protobuf/dataset_summary.bin"
    # the writer must not have created a local "s3:" directory by mistake
    assert "s3:" not in [d.name for d in os.scandir(os.getcwd()) if d.is_dir()]
def test_profile_viewer(tmpdir, local_config_path):
    """profile_viewer writes the HTML report to the requested path and returns it."""
    sess = session_from_config(load_config(local_config_path))
    ts = datetime.datetime(2021, 6, 2)
    with sess.logger("mytestytest", dataset_timestamp=ts) as logger:
        for _ in range(5):
            logger.log({"uniform_integers": np.random.randint(0, 50)})
            logger.log({"nulls": None})
        profile = logger.profile
    report_path = tmpdir + "my_test.html"
    result = profile_viewer(profiles=[profile], output_path=report_path)
    assert os.path.exists(report_path)
    assert result == report_path
def test_patch_multiple_times(mlflow_config_path):
    """Calling enable_mlflow repeatedly is idempotent and keeps mlflow patched."""
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    sess = session_from_config(load_config(mlflow_config_path))
    # patch three times
    for _ in range(3):
        assert whylogs.enable_mlflow(sess)
    import mlflow
    assert mlflow.whylogs is not None
    whylogs.mlflow.disable_mlflow()
def test_get_run_profiles_shouldReturn_multipleProfiles(
        tmpdir, mlflow_config_path):
    """Profiles logged under different dataset names are retrievable per run."""
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    sess = session_from_config(load_config(mlflow_config_path))
    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(sess)
    with mlflow.start_run():
        mlflow.whylogs.log(features={"a": 1})
        mlflow.whylogs.log(features={"a": 1}, dataset_name="another-profile")
    with mlflow.start_run():
        mlflow.whylogs.log(features={"a": 1}, dataset_name="another-profile")
    runs = whylogs.mlflow.list_whylogs_runs("0")
    default_profiles = whylogs.mlflow.get_run_profiles(run_id=runs[0].run_id)
    another_profile = whylogs.mlflow.get_run_profiles(
        run_id=runs[0].run_id, dataset_name="another-profile")
    assert len(runs) == 2
    # verify the number of profiles for each datasetname
    assert len(whylogs.mlflow.get_experiment_profiles("0", dataset_name="default")) == 2
    assert len(whylogs.mlflow.get_experiment_profiles("0", dataset_name="another-profile")) == 2
    # for the first run, verify content
    assert len(default_profiles) == 1
    assert len(another_profile) == 1
    # assert default_profiles[0].name == "default"
    assert default_profiles[0].dataset_timestamp is not None
    assert another_profile[0].dataset_timestamp is not None
def test_assert_whylogsrun_close_is_called(tmpdir, mlflow_config_path):
    """Ending an mlflow run must close the patched WhyLogsRun exactly once."""
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    sess = session_from_config(load_config(mlflow_config_path))
    set_up_mlflow(mlflow, tmpdir)
    with mock.patch.object(whylogs.mlflow.patcher.WhyLogsRun, "_close") as mock_close:
        whylogs.enable_mlflow(sess)
        with mlflow.start_run():
            pass
        mock_close.assert_called_once()
    whylogs.mlflow.disable_mlflow()
def test_assert_log_artifact_is_called(tmpdir, mlflow_config_path):
    """Logging features inside a run uploads the profile via mlflow.log_artifact."""
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    sess = session_from_config(load_config(mlflow_config_path))
    set_up_mlflow(mlflow, tmpdir)
    with mock.patch.object(mlflow, "log_artifact") as log_artifact:
        whylogs.enable_mlflow(sess)
        with mlflow.start_run():
            mlflow.whylogs.log(features={"a": 1})
        log_artifact.assert_called_once()
    whylogs.mlflow.disable_mlflow()
def test_log_rotation_hour(tmpdir, df):
    """Hourly rotation across a 3-hour jump yields two output files."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess_cfg = SessionConfig("project", "pipeline", writers=[writer_cfg])
    with freeze_time("2012-01-14 03:21:34", tz_offset=-4) as frozen_time:
        with session_from_config(sess_cfg) as sess:
            with sess.logger("test", with_rotation_time="h", cache_size=1) as logger:
                logger.log_dataframe(df)
                frozen_time.tick(delta=datetime.timedelta(hours=3))
                logger.log(feature_name="E", value=4)
                logger.log_dataframe(df)
    written = [name for _, _, files in os.walk(out_dir) for name in files]
    assert len(written) == 2
    shutil.rmtree(out_dir, ignore_errors=True)
def test_log_image(tmpdir, image_files):
    """Logging image files by path produces the expected 19 profile columns."""
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir, ignore_errors=True)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("image_test") as logger:
        for path in image_files:
            logger.log_image(path)
        assert len(logger.profile.columns) == 19
    shutil.rmtree(out_dir, ignore_errors=True)
def test_listRuns_shouldReturn_NoRuns(tmpdir, mlflow_config_path):
    """Runs that never log whylogs data are not listed as whylogs runs."""
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    sess = session_from_config(load_config(mlflow_config_path))
    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(sess)
    # ten empty runs: visible to mlflow, invisible to whylogs
    for _ in range(10):
        with mlflow.start_run():
            pass
    assert len(mlflow.list_run_infos("0")) == 10
    assert len(whylogs.mlflow.list_whylogs_runs("0")) == 0
    whylogs.mlflow.disable_mlflow()
def test_listRuns_shouldReturn_CorrectRunCount(tmpdir, mlflow_config_path):
    """Only runs that logged whylogs features count as whylogs runs.

    Fix: removed a leftover debug statement (``print("WEIRD")``).
    """
    import mlflow
    import whylogs
    from whylogs.app.config import load_config
    from whylogs.app.session import session_from_config

    assert os.path.exists(mlflow_config_path)
    config = load_config(mlflow_config_path)
    session = session_from_config(config)
    set_up_mlflow(mlflow, tmpdir)
    whylogs.enable_mlflow(session)
    # 10 runs total; whylogs data logged only on the 5 even-indexed ones
    for i in range(0, 10):
        with mlflow.start_run():
            if i % 2 == 0:
                mlflow.whylogs.log(features={"a": 1})
    assert len(mlflow.list_run_infos("0")) == 10
    assert len(whylogs.mlflow.list_whylogs_runs("0")) == 5
    assert len(whylogs.mlflow.get_experiment_profiles("0")) == 5
    whylogs.mlflow.disable_mlflow()
def test_log_pil_image(tmpdir, image_files):
    """PIL images through a per-second-rotation logger yield 19 profile columns.

    NOTE(review): this re-defines ``test_log_pil_image`` from earlier in the
    file, shadowing it under pytest — consider renaming one of them.
    """
    out_dir = tmpdir.mkdir("whylogs")
    shutil.rmtree(out_dir)
    writer_cfg = WriterConfig("local", ["protobuf"], out_dir.realpath())
    WriterConfig.from_yaml(writer_cfg.to_yaml())
    sess = session_from_config(SessionConfig("project", "pipeline", writers=[writer_cfg]))
    with sess.logger("image_pil_test", with_rotation_time="s", cache_size=1) as logger:
        for path in image_files:
            logger.log_image(Image.open(path))
        assert len(logger.profile.columns) == 19
    shutil.rmtree(out_dir)