def test_failed_target_with_wrapper(self, mock_channel_tracker, pandas_data_frame):
    @task()
    def task_with_log_dataset_wrapper():
        with dataset_op_logger(
            op_path=target("/path/to/value.csv"),
            data=pandas_data_frame,
            op_type="write",
            with_preview=True,
        ) as logger:
            # Deliberately raise ZeroDivisionError inside the logged block
            # so the operation is reported with a failed (NOK) status.
            ans = 42
            ans / 0

    try:
        task_with_log_dataset_wrapper()
    except Exception:
        pass

    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "/path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.write
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.NOK
    assert log_dataset_arg.value_preview is not None
    assert log_dataset_arg.data_dimensions == (5, 3)
    assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
        "columns",
        "dtypes",
        "shape",
        "size.bytes",
        "type",
    }

def test_log_dataset_with_row_and_column_count(self, mock_channel_tracker):
    @task()
    def task_with_log_datasets():
        log_dataset_op(
            "location://path/to/value.csv",
            DbndDatasetOperationType.read,
            row_count=987,
            column_count=4,
        )

    task_with_log_datasets()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "location://path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
    assert log_dataset_arg.value_preview == ""
    assert log_dataset_arg.data_dimensions == (987, 4)
    assert log_dataset_arg.data_schema is None

def test_path_with_data_meta(self, mock_channel_tracker, pandas_data_frame):
    @task()
    def task_with_log_datasets():
        log_dataset_op(
            "/path/to/value.csv",
            DbndDatasetOperationType.read,
            data=pandas_data_frame,
            with_preview=True,
            with_schema=True,
        )

    task_with_log_datasets()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "/path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
    assert log_dataset_arg.value_preview is not None
    assert log_dataset_arg.data_dimensions == (5, 3)
    assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
        "columns",
        "dtypes",
        "shape",
        "size.bytes",
        "type",
    }

    # Metric keys are derived from the dataset path, with "/" mapped to ".".
    log_metrics_args = get_log_metrics(mock_channel_tracker)
    metrics_names = {metric_row["metric"].key for metric_row in log_metrics_args}
    assert metrics_names.issuperset(
        {
            "path.to.value.csv.schema",
            "path.to.value.csv.shape0",
            "path.to.value.csv.shape1",
            "path.to.value.csv.rows",
            "path.to.value.csv.columns",
            "path.to.value.csv",
        }
    )

def test_with_actual_op_path(self, mock_channel_tracker):
    @task()
    def task_with_log_datasets():
        a_target = target("/path/to/value.csv")
        log_dataset_op(a_target, DbndDatasetOperationType.read, with_schema=False)

    task_with_log_datasets()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "/path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
    assert log_dataset_arg.value_preview == ""
    assert log_dataset_arg.data_dimensions is None
    assert log_dataset_arg.data_schema is None

    # No data was attached, so no metrics should be reported.
    log_metrics_args = get_log_metrics(mock_channel_tracker)
    assert len(list(log_metrics_args)) == 0

def test_log_dataset(self, mock_channel_tracker):
    @task()
    def task_with_log_datasets():
        log_dataset_op(
            "location://path/to/value.csv",
            DbndDatasetOperationType.read,
            with_schema=False,
        )

    task_with_log_datasets()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "location://path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
    assert log_dataset_arg.value_preview == ""
    assert log_dataset_arg.data_dimensions is None
    assert log_dataset_arg.data_schema is None

    # no metrics reported
    log_metrics_args = list(get_log_metrics(mock_channel_tracker))
    assert len(log_metrics_args) == 0

def test_failed_target(self, mock_channel_tracker):
    @task()
    def task_with_log_datasets():
        log_dataset_op(
            "location://path/to/value.csv",
            "read",  # check that plain str op_type values are accepted too
            success=False,
            with_schema=False,
        )

    task_with_log_datasets()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "location://path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.NOK
    assert log_dataset_arg.value_preview == ""
    assert log_dataset_arg.data_dimensions is None
    assert log_dataset_arg.data_schema is None

    log_metrics_args = get_log_metrics(mock_channel_tracker)
    assert len(list(log_metrics_args)) == 0

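# The next test takes `with_histograms` / `with_stats` as arguments, so it is
# presumably parametrized. Decorators along these lines are assumed (not from
# the original; the full True/False grid matches the four branches asserted
# below, and `import pytest` is assumed at module top):
@pytest.mark.parametrize("with_histograms", [True, False])
@pytest.mark.parametrize("with_stats", [True, False])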
def test_log_dataset_op_histograms_stats_flags(
    self, mock_channel_tracker, with_histograms, with_stats
):
    # Test the with_histograms/with_stats flags for a pandas dataframe
    with open(THIS_DIR + "/nested_data.json", encoding="utf-8-sig") as f:
        nested_json = pd.json_normalize(json.load(f))

    @task()
    def task_log_dataset_op_nested_json_data():
        log_dataset_op(
            op_path="/my/path/to/nested_data.json",
            op_type=DbndDatasetOperationType.write,
            data=nested_json,
            with_histograms=with_histograms,
            with_stats=with_stats,
        )

    task_log_dataset_op_nested_json_data()
    log_dataset_arg: LogDatasetArgs = one(get_log_datasets(mock_channel_tracker))
    metrics_info = list(get_log_metrics(mock_channel_tracker))
    histograms_metrics = [
        m for m in metrics_info if m["metric"].key.endswith("histograms")
    ]

    # Each flag should take effect independently of the other.
    if with_histograms and with_stats:
        assert histograms_metrics
        assert log_dataset_arg.columns_stats
    elif with_histograms:
        assert histograms_metrics
        assert not log_dataset_arg.columns_stats
    elif with_stats:
        assert not histograms_metrics
        assert log_dataset_arg.columns_stats
    else:
        assert not histograms_metrics
        assert not log_dataset_arg.columns_stats

def test_log_dataset_override_row_count(self, mock_channel_tracker, pandas_data_frame):
    @task()
    def task_with_log_dataset_wrapper():
        with dataset_op_logger(
            op_path=target("/path/to/value.csv"), op_type="read", with_preview=True
        ) as logger:
            # An explicit row_count overrides the row dimension inferred from
            # the dataframe (5 rows); the column count stays inferred.
            logger.set(data=pandas_data_frame, row_count=999)

    task_with_log_dataset_wrapper()
    log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
    assert log_dataset_arg.operation_path == "/path/to/value.csv"
    assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
    assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
    assert log_dataset_arg.value_preview is not None
    assert log_dataset_arg.data_dimensions == (999, 3)
    assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
        "columns",
        "dtypes",
        "shape",
        "size.bytes",
        "type",
    }
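
# For orientation: a minimal sketch of how the two APIs exercised by these
# tests are used from application code. The import line and the example paths
# are assumptions (not part of the tests above); the call signatures mirror
# the ones used in the tests. Underscore-prefixed so pytest won't collect it.
def _example_dataset_logging(df):
    from dbnd import dataset_op_logger, log_dataset_op

    # One-shot form: report a read, attaching the data so dbnd can extract
    # the schema and dimensions.
    log_dataset_op("/tmp/example.csv", "read", data=df, with_schema=True)

    # Context-manager form: an unhandled exception inside the block reports
    # the operation with a failed (NOK) status, as test_failed_target_with_wrapper
    # asserts above.
    with dataset_op_logger("/tmp/example.csv", "write") as logger:
        logger.set(data=df)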