    def test_failed_target_with_wrapper(self, mock_channel_tracker,
                                        pandas_data_frame):
        @task()
        def task_with_log_dataset_wrapper():
            with dataset_op_logger(
                    op_path=target("/path/to/value.csv"),
                    data=pandas_data_frame,
                    op_type="write",
                    with_preview=True,
            ):
                ans = 42
                ans / 0  # intentionally raises ZeroDivisionError to fail the operation

        try:
            task_with_log_dataset_wrapper()
        except Exception:
            pass

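        # the exception aborts the task, but the failed operation is still
        # reported with status NOK and metadata captured from `data`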
        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.write
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.NOK
        assert log_dataset_arg.value_preview is not None
        assert log_dataset_arg.data_dimensions == (5, 3)
        assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
            "columns",
            "dtypes",
            "shape",
            "size.bytes",
            "type",
        }

    def test_log_dataset_with_row_and_column_count(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
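            # explicit row/column counts populate data_dimensions without
            # passing any data object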
            log_dataset_op(
                "location://path/to/value.csv",
                DbndDatasetOperationType.read,
                row_count=987,
                column_count=4,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "location://path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions == (987, 4)
        assert log_dataset_arg.data_schema is None

    def test_path_with_data_meta(self, mock_channel_tracker,
                                 pandas_data_frame):
        @task()
        def task_with_log_datasets():
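            # with_preview/with_schema capture preview and schema from the
            # DataFrame and additionally emit per-dataset metrics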
            log_dataset_op(
                "/path/to/value.csv",
                DbndDatasetOperationType.read,
                data=pandas_data_frame,
                with_preview=True,
                with_schema=True,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))

        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview is not None
        assert log_dataset_arg.data_dimensions == (5, 3)
        assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
            "columns",
            "dtypes",
            "shape",
            "size.bytes",
            "type",
        }

        log_metrics_args = get_log_metrics(mock_channel_tracker)
        metrics_names = {
            metric_row["metric"].key
            for metric_row in log_metrics_args
        }
        assert metrics_names.issuperset({
            "path.to.value.csv.schema",
            "path.to.value.csv.shape0",
            "path.to.value.csv.shape1",
            "path.to.value.csv.rows",
            "path.to.value.csv.columns",
            "path.to.value.csv",
        })
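
    # Hedged sketch of the path-to-metric-key mapping the assertions above
    # imply (leading "/" dropped, remaining "/" turned into "."); illustrative
    # only, not dbnd's actual implementation:
    @staticmethod
    def _assumed_metric_prefix(path):
        # "/path/to/value.csv" -> "path.to.value.csv"
        return path.strip("/").replace("/", ".")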

    def test_with_actual_op_path(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
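            # a Target object is accepted in place of a raw path string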
            a_target = target("/path/to/value.csv")
            log_dataset_op(a_target,
                           DbndDatasetOperationType.read,
                           with_schema=False)

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

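        # with_schema=False and no data payload: nothing is sent via log_metrics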
        log_metrics_args = get_log_metrics(mock_channel_tracker)
        assert len(list(log_metrics_args)) == 0

    def test_log_dataset(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
            log_dataset_op(
                "location://path/to/value.csv",
                DbndDatasetOperationType.read,
                with_schema=False,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "location://path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

        # no metrics reported
        log_metrics_args = list(get_log_metrics(mock_channel_tracker))
        assert len(log_metrics_args) == 0

    def test_failed_target(self, mock_channel_tracker):
        @task()
        def task_with_log_datasets():
            log_dataset_op(
                "location://path/to/value.csv",
                "read",  # Check passing str values too
                success=False,
                with_schema=False,
            )

        task_with_log_datasets()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "location://path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.NOK
        assert log_dataset_arg.value_preview == ""
        assert log_dataset_arg.data_dimensions is None
        assert log_dataset_arg.data_schema is None

        log_metrics_args = get_log_metrics(mock_channel_tracker)
        assert len(list(log_metrics_args)) == 0

    # Assumed parametrization (with pytest imported in the module header):
    # the flags presumably arrive as pytest params, so all four boolean
    # combinations are exercised by the branches below.
    @pytest.mark.parametrize("with_histograms", [True, False])
    @pytest.mark.parametrize("with_stats", [True, False])
    def test_log_dataset_op_histograms_stats_flags(self, mock_channel_tracker,
                                                   with_histograms,
                                                   with_stats):
        # Exercise the with_histograms/with_stats flags on a pandas DataFrame.

        with open(THIS_DIR + "/nested_data.json", encoding="utf-8-sig") as f:
            nested_json = pd.json_normalize(json.load(f))

        @task()
        def task_log_dataset_op_nested_json_data():
            log_dataset_op(
                op_path="/my/path/to/nested_data.json",
                op_type=DbndDatasetOperationType.write,
                data=nested_json,
                with_histograms=with_histograms,
                with_stats=with_stats,
            )

        task_log_dataset_op_nested_json_data()

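        # histogram metrics travel through log_metrics, while column stats are
        # attached to the dataset-operation payload itself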
        log_dataset_arg: LogDatasetArgs = one(
            get_log_datasets(mock_channel_tracker))
        metrics_info = list(get_log_metrics(mock_channel_tracker))
        histograms_metrics = list(
            filter(lambda m: m["metric"].key.endswith("histograms"),
                   metrics_info))
        if with_histograms and with_stats:
            assert histograms_metrics
            assert log_dataset_arg.columns_stats
        elif with_histograms:
            assert histograms_metrics
            assert not log_dataset_arg.columns_stats
        elif with_stats:
            assert not histograms_metrics
            assert log_dataset_arg.columns_stats
        else:
            assert not histograms_metrics
            assert not log_dataset_arg.columns_stats

    def test_log_dataset_override_row_count(self, mock_channel_tracker,
                                            pandas_data_frame):
        @task()
        def task_with_log_dataset_wrapper():
            with dataset_op_logger(op_path=target("/path/to/value.csv"),
                                   op_type="read",
                                   with_preview=True) as logger:
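                # row_count overrides the first dimension inferred from the data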
                logger.set(data=pandas_data_frame, row_count=999)

        task_with_log_dataset_wrapper()

        log_dataset_arg = one(get_log_datasets(mock_channel_tracker))
        assert log_dataset_arg.operation_path == "/path/to/value.csv"
        assert log_dataset_arg.operation_type == DbndDatasetOperationType.read
        assert log_dataset_arg.operation_status == DbndTargetOperationStatus.OK
        assert log_dataset_arg.value_preview is not None
        assert log_dataset_arg.data_dimensions == (999, 3)
        assert set(log_dataset_arg.data_schema.as_dict().keys()) == {
            "columns",
            "dtypes",
            "shape",
            "size.bytes",
            "type",
        }