Example #1
def calc_meta_conf_for_value_type(tracking_level, value_type, target=None):
    # type: (ValueTrackingLevel, ValueType, Optional[Target]) -> ValueMetaConf
    """
    Calculating the right value log config base on the value type in order control the tracking of
    lazy evaluated types like spark dataframes

    IMPORTANT - The result is ValueMetaConf with restrictions only! this should be merged into a full ValueMetaConf.
    """

    if tracking_level == ValueTrackingLevel.ALL:
        # no restrictions
        return ValueMetaConf()

    if tracking_level == ValueTrackingLevel.SMART:

        result = ValueMetaConf()
        if value_type.is_lazy_evaluated:
            # restrict only for lazily evaluated values
            result = ValueMetaConf.disabled_expensive()

        elif target is not None and not value_type.support_fast_count(target):
            # we don't set it to True because a different
            # configuration might want it to be False
            result = attr.evolve(result, log_size=False)

        return result

    if tracking_level == ValueTrackingLevel.NONE:
        # restrict everything
        return ValueMetaConf.disabled_expensive()
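
Because the SMART branch returns restrictions only, the caller is expected to merge the result into a complete ValueMetaConf. A minimal sketch of that merge, assuming merge_if_none keeps left-hand values that are already set and fills None fields from the right (the semantics exercised in Example #4 below); value_type and target are placeholders:

# Hedged sketch: combine the restriction-only conf with a fully enabled one.
restrictions = calc_meta_conf_for_value_type(
    ValueTrackingLevel.SMART, value_type, target
)
full_conf = restrictions.merge_if_none(ValueMetaConf.enabled())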
Example #2
    def test_task_metrics_histograms(self, tmpdir, pandas_data_frame):
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.tracking.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_data("df", pandas_data_frame, meta_conf=ValueMetaConf.enabled())

        hist_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.histograms)

        expected_preview = (
            "   Names  Births  Married\n"
            "     Bob     968     True\n"
            " Jessica     155    False\n"
            "    Mary      77     True\n"
            "    John     578    False\n"
            "     Mel     973     True"
        )

        # the std value varies across Python versions due to floating-point precision
        df_births_std = hist_metrics["df.Births.std"]
        assert df_births_std == pytest.approx(428.4246)
Example #3
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled())
        tr_tracker.log_metric("a", 1)
        tr_tracker.log_metric("a_string", "1")
        tr_tracker.log_metric("a_list", [1, 3])
        tr_tracker.log_metric("a_tuple", (1, 2))
        tr_tracker.log_dataframe("df",
                                 pandas_data_frame,
                                 meta_conf=ValueMetaConf.enabled())

        actual = TaskRunMetricsFileStoreReader(
            metrics_folder).get_all_metrics_values()

        print(actual)
        assert "df.schema" in actual
        del actual["df.schema"]
        assert actual == {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
            "df.preview": "Names  Births",
            "df.shape": "(5, 2)",
            "df.shape_0_": 5.0,
            "df.shape_1_": 2.0,
        }
Example #4
class TestValueMetaConf(object):
    @pytest.mark.parametrize(
        "left, right, expected",
        [
            (ALL_NONE, ALL_TRUE, ALL_TRUE),
            (ALL_TRUE, ALL_NONE, ALL_TRUE),
            (
                ValueMetaConf(
                    log_schema=True, log_size=False, log_preview=True, log_stats=True
                ),
                ALL_FALSE,
                ValueMetaConf(
                    log_schema=True,
                    log_size=False,
                    log_preview=True,
                    log_stats=True,
                    log_histograms=False,
                ),
            ),
            (ALL_FALSE, ALL_TRUE, ALL_FALSE),
            (ALL_TRUE, ALL_FALSE, ALL_TRUE),
        ],
    )
    def test_merging_2(self, left, right, expected):
        assert left.merge_if_none(right) == expected

    @pytest.mark.parametrize(
        "meta_conf_list, expected",
        [
            ([ALL_NONE, ALL_TRUE, ALL_FALSE], ALL_TRUE),
            ([ALL_NONE, ALL_NONE, ALL_FALSE], ALL_FALSE),
            (
                [
                    ALL_NONE,
                    ValueMetaConf(
                        log_preview=True,
                        log_schema=True,
                        log_size=True,
                        log_stats=False,
                    ),
                    ALL_FALSE,
                ],
                ValueMetaConf(
                    log_preview=True,
                    log_schema=True,
                    log_size=True,
                    log_stats=False,
                    log_histograms=False,
                ),
            ),
        ],
    )
    def test_summing(self, meta_conf_list, expected):
        assert reduce(lambda x, y: x.merge_if_none(y), meta_conf_list) == expected
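
The ALL_NONE, ALL_TRUE and ALL_FALSE fixtures are not part of this snippet; presumably they are module-level constants along these lines (an assumption, since only their merge behavior is asserted above):

# Assumed fixture definitions, not shown in the original snippet:
ALL_NONE = ValueMetaConf()  # every flag left as None
ALL_TRUE = ValueMetaConf(
    log_preview=True, log_schema=True, log_size=True,
    log_stats=True, log_histograms=True,
)
ALL_FALSE = ValueMetaConf(
    log_preview=False, log_schema=False, log_size=False,
    log_stats=False, log_histograms=False,
)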
Example #5
    def test_spark_df_value_meta(self, spark_data_frame):
        expected_data_schema = {
            "type": SparkDataFrameValueType.type_str,
            "columns": list(spark_data_frame.schema.names),
            "size": int(spark_data_frame.count() * len(spark_data_frame.columns)),
            "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
            "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()),
            data_dimensions=(spark_data_frame.count(),
                             len(spark_data_frame.columns)),
            data_schema=expected_data_schema,
            data_hash=None,
        )

        df_value_meta = SparkDataFrameValueType().get_value_meta(
            spark_data_frame, meta_conf=meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Example #6
    def test_task_metrics_simple(self, tmpdir, pandas_data_frame):
        metrics_folder = target(str(tmpdir))

        task_run = Mock()
        task_run.meta_files = TaskRunMetaFiles(metrics_folder)
        t = FileTrackingStore()
        tr_tracker = TaskRunTracker(task_run=task_run, tracking_store=t)
        tr_tracker.settings.features.get_value_meta_conf = Mock(
            return_value=ValueMetaConf.enabled()
        )
        tr_tracker.log_metric("a", 1)
        tr_tracker.log_metric("a_string", "1")
        tr_tracker.log_metric("a_list", [1, 3])
        tr_tracker.log_metric("a_tuple", (1, 2))

        user_metrics = TaskRunMetricsFileStoreReader(
            metrics_folder
        ).get_all_metrics_values(MetricSource.user)

        assert user_metrics == {
            "a": 1.0,
            "a_list": "[1, 3]",
            "a_string": 1.0,
            "a_tuple": "(1, 2)",
        }
Example #7
    def test_get_value_meta_empty(self, snowflake_table):
        # Arrange
        with mock.patch(
                "dbnd_snowflake.snowflake_values.SnowflakeController",
                new_callable=snowflake_controller_mock,
        ) as snowflake:
            # Act
            value_meta = SnowflakeTableValueType().get_value_meta(
                snowflake_table,
                meta_conf=(ValueMetaConf(log_preview=False,
                                         log_schema=False,
                                         log_size=False)),
            )

        # Assert
        assert value_meta.value_preview is None
        assert value_meta.data_dimensions is None
        assert value_meta.data_schema == {}
        assert (
            value_meta.data_hash ==
            "snowflake://*****:*****@SNOWFLAKE_ACCOUNT/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        assert not snowflake.get_column_types.called
        assert not snowflake.get_dimensions.called
        assert not snowflake.to_preview.called
Example #8
def _log_inputs(task_run):
    """
    For tracking mode. Logs InMemoryTarget inputs.
    """
    try:
        params = task_run.task._params
        for param in params.get_params(input_only=True):
            value = params.get_value(param.name)

            if isinstance(value, InMemoryTarget):
                try:
                    param = param.modify(value_meta_conf=ValueMetaConf(
                        log_preview=True,
                        log_schema=True,
                    ))

                    task_run.tracker.log_parameter_data(
                        parameter=param,
                        target=value,
                        value=value._obj,
                        operation_type=DbndTargetOperationType.read,
                        operation_status=DbndTargetOperationStatus.OK,
                    )
                except Exception as ex:
                    log_exception(
                        "Failed to log input param to tracking store.",
                        ex=ex,
                        non_critical=True,
                    )
    except Exception as ex:
        log_exception("Failed to log input params to tracking store.",
                      ex=ex,
                      non_critical=True)
Example #9
def log_dataframe(key,
                  value,
                  with_preview=True,
                  with_size=True,
                  with_schema=True,
                  with_stats=False):
    # type: (str, Union[pd.DataFrame, spark.DataFrame], bool, bool, bool, bool) -> None

    meta_conf = ValueMetaConf(
        log_preview=with_preview,
        log_schema=with_schema,
        log_size=with_size,
        log_stats=with_stats,
    )
    tracker = _get_tracker()
    if tracker:
        tracker.log_dataframe(key, value, meta_conf=meta_conf)
        return

    from dbnd._core.task_run.task_run_tracker import get_value_meta_for_metric

    value_type = get_value_meta_for_metric(key, value, meta_conf=meta_conf)
    if value_type:
        logger.info("Log DataFrame '{}': shape='{}'".format(
            key, value_type.data_dimensions))
    else:
        logger.info("Log DataFrame '{}': {} is not supported".format(
            key, type(value)))
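
A hedged usage sketch for the wrapper above; df stands in for any supported pandas or Spark DataFrame:

# Preview, size and schema are logged by default; stats are opt-in.
log_dataframe("customers", df, with_stats=True)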
Example #10
def log_data(
    key,  # type: str
    value=None,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
    path=None,  # type: Optional[str]
    operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
    with_preview=None,  # type: Optional[bool]
    with_size=None,  # type: Optional[bool]
    with_schema=None,  # type: Optional[bool]
    with_stats=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    with_histograms=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
    raise_on_error=False,  # type: bool
):  # type: (...) -> None
    tracker = _get_tracker()
    if not tracker:
        return

    meta_conf = ValueMetaConf(
        log_preview=with_preview,
        log_schema=with_schema,
        log_size=with_size,
        log_stats=with_stats,
        log_histograms=with_histograms,
    )

    tracker.log_data(
        key,
        value,
        meta_conf=meta_conf,
        path=path,
        operation_type=operation_type,
        raise_on_error=raise_on_error,
    )
Example #11
    def test_df_value_meta(self, pandas_data_frame):
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_)
                for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values),
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema)
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta == expected_value_meta
Example #12
    def test_get_value_meta(self, snowflake_table):
        # Arrange
        with mock.patch(
                "dbnd_snowflake.snowflake_values.SnowflakeController",
                new_callable=snowflake_controller_mock,
        ) as snowflake:
            # Act
            value_meta = SnowflakeTableValueType().get_value_meta(
                snowflake_table, meta_conf=(ValueMetaConf.enabled()))

        # Assert
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        assert value_meta.data_schema == {
            "type": "SnowflakeTable",
            "column_types": {
                "name": "varchar"
            },
            "size": "500 B",
        }
        assert (
            value_meta.data_hash ==
            "snowflake://*****:*****@SNOWFLAKE_ACCOUNT/SNOWFLAKE_SAMPLE_DATA.TPCDS_SF100TCL/CUSTOMER"
        )
        assert snowflake.get_column_types.called
        assert snowflake.get_dimensions.called
        assert snowflake.to_preview.called
Example #13
def _log_parameter_value(task_run, parameter_definition, target, value):
    # make sure it will be logged correctly
    parameter_definition = parameter_definition.modify(
        value_meta_conf=ValueMetaConf(
            log_preview=True,
            log_schema=True,
        ))

    try:
        # case what if result is Proxy
        value_type = get_value_type_of_obj(value,
                                           parameter_definition.value_type)
        task_run.run.target_origin.add(target, value, value_type)
    except Exception as ex:
        log_exception("Failed to register result to target tracking.",
                      ex=ex,
                      non_critical=True)

    try:
        task_run.tracker.log_parameter_data(
            # was: task_run.task.task_definition.task_class.result
            parameter=parameter_definition,
            target=target,
            value=value,
            operation_type=DbndTargetOperationType.write,  # is it write? (or log?)
            operation_status=DbndTargetOperationStatus.OK,
        )
    except Exception as ex:
        log_exception("Failed to log result to tracking store.",
                      ex=ex,
                      non_critical=True)
Example #14
 def meta_conf(self):
     return ValueMetaConf(
         log_preview=self.with_preview,
         log_schema=self.with_schema,
         log_size=self.with_size,
         log_stats=self.with_stats,
         log_histograms=self.with_histograms,
     )
Example #15
 def test_str_value_meta(self):
     str_value_meta = StrValueType().get_value_meta("foo", ValueMetaConf.enabled())
     expected_value_meta = ValueMeta(
         value_preview="foo",
         data_dimensions=None,
         data_schema={"type": "str"},
         data_hash=fast_hasher.hash("foo"),
     )
     assert str_value_meta == expected_value_meta
Example #16
    def _build_parameter(self, context="inline"):
        s = self.parameter  # type: ParameterDefinition
        update_kwargs = {}

        value_type = self._build_value_type(context)

        validator = s.validator
        if s.choices:
            validator = ChoiceValidator(s.choices)

        if is_not_defined(s.default):
            if s.empty_default:
                update_kwargs["default"] = value_type._generate_empty_default()

        if not is_defined(s.load_on_build):
            update_kwargs["load_on_build"] = value_type.load_on_build

        # create value meta
        if s.value_meta_conf is None:
            update_kwargs["value_meta_conf"] = ValueMetaConf(
                log_preview=s.log_preview,
                log_preview_size=s.log_preview_size,
                log_schema=s.log_schema,
                log_size=s.log_size,
                log_stats=s.log_stats,
                log_histograms=s.log_histograms,
            )

        # Whether different values for this parameter will differentiate otherwise equal tasks
        description = s.description
        if not is_defined(description):
            if s.is_output() and s.default_output_description:
                description = s.default_output_description
            elif not s.load_on_build and s.default_input_description:
                description = s.default_input_description
            else:
                description = s.default_description

            if s.validator:
                description = _add_description(description, validator.description)
            update_kwargs["description"] = description()
        # We need to keep track of this to get the order right (see Task class)
        ParameterDefinition._total_counter += 1
        if s.kind == _ParameterKind.task_output:
            update_kwargs["significant"] = False

        updated = self.modify(
            value_type=value_type,
            value_type_defined=value_type,
            validator=validator,
            description=description,
            parameter_id=ParameterDefinition._total_counter,
            **update_kwargs
        )

        return updated.parameter
Example #17
    def test_spark_df_value_meta(self, spark_data_frame,
                                 spark_data_frame_histograms,
                                 spark_data_frame_stats):
        expected_data_schema = {
            "type": SparkDataFrameValueType.type_str,
            "columns": list(spark_data_frame.schema.names),
            "size.bytes": int(spark_data_frame.count() * len(spark_data_frame.columns)),
            "shape": (spark_data_frame.count(), len(spark_data_frame.columns)),
            "dtypes": {f.name: str(f.dataType) for f in spark_data_frame.schema.fields},
        }

        expected_hist_sys_metrics = {
            "boolean_histograms_and_stats_calc_time",
            "histograms_and_stats_calc_time",
            "numeric_histograms_and_stats_calc_time",
            "string_histograms_and_stats_calc_time",
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=SparkDataFrameValueType().to_preview(
                spark_data_frame, meta_conf.get_preview_size()),
            data_dimensions=(spark_data_frame.count(),
                             len(spark_data_frame.columns)),
            data_hash=SparkDataFrameValueType().to_signature(spark_data_frame),
            data_schema=expected_data_schema,
            descriptive_stats=spark_data_frame_stats,
            histograms=spark_data_frame_histograms,
        )

        df_value_meta = SparkDataFrameValueType().get_value_meta(
            spark_data_frame, meta_conf)

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        # descriptive stats vary between runs and formats; they are already covered by the histogram tests
        # assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        # histogram_system_metrics values are too dynamic, so checking only keys, but not values
        assert (set(df_value_meta.histogram_system_metrics.keys()) ==
                expected_hist_sys_metrics)
        df_value_meta.histogram_system_metrics = None

        # assert df_value_meta.histograms == expected_value_meta.histograms
        # assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)

        pandas_data_frame = spark_data_frame.toPandas()
        pandas_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf)
Example #18
    def test_df_value_meta(
        self, pandas_data_frame, pandas_data_frame_histograms, pandas_data_frame_stats
    ):
        expected_data_schema = {
            "type": DataFrameValueType.type_str,
            "columns": list(pandas_data_frame.columns),
            "size": int(pandas_data_frame.size),
            "shape": pandas_data_frame.shape,
            "dtypes": {
                col: str(type_) for col, type_ in pandas_data_frame.dtypes.items()
            },
        }

        meta_conf = ValueMetaConf.enabled()
        expected_value_meta = ValueMeta(
            value_preview=DataFrameValueType().to_preview(
                pandas_data_frame, preview_size=meta_conf.get_preview_size()
            ),
            data_dimensions=pandas_data_frame.shape,
            data_schema=expected_data_schema,
            data_hash=fast_hasher.hash(
                hash_pandas_object(pandas_data_frame, index=True).values
            ),
            descriptive_stats=pandas_data_frame_stats,
            histograms=pandas_data_frame_histograms,
        )

        df_value_meta = DataFrameValueType().get_value_meta(
            pandas_data_frame, meta_conf=meta_conf
        )

        assert df_value_meta.value_preview == expected_value_meta.value_preview
        assert df_value_meta.data_hash == expected_value_meta.data_hash
        assert json_utils.dumps(df_value_meta.data_schema) == json_utils.dumps(
            expected_value_meta.data_schema
        )
        assert df_value_meta.data_dimensions == expected_value_meta.data_dimensions

        std = df_value_meta.descriptive_stats["Births"].pop("std")
        expected_std = expected_value_meta.descriptive_stats["Births"].pop("std")
        assert round(std, 2) == expected_std
        df_value_meta.descriptive_stats["Names"].pop("top")
        assert df_value_meta.descriptive_stats == expected_value_meta.descriptive_stats

        counts, values = df_value_meta.histograms.pop("Names")
        expected_counts, expected_values = expected_value_meta.histograms.pop("Names")
        assert counts == expected_counts
        assert set(values) == set(expected_values)  # order changes in each run
        # histograms change between runs and are covered by the histogram tests, so they are not re-asserted here
        df_value_meta.histograms = expected_value_meta.histograms = None

        expected_value_meta.histogram_system_metrics = (
            df_value_meta.histogram_system_metrics
        )
        assert df_value_meta.data_schema == expected_value_meta.data_schema
        assert attr.asdict(df_value_meta) == attr.asdict(expected_value_meta)
Example #19
    def test_task_run_sync_local_multi_target(
        self,
        monkeypatch,
        my_multitarget,
        test_task,
        create_local_multitarget,
        mock_fs_download,
        mock_file_metadata_registry,
        mock_target_move_from,
    ):
        test_task = test_task.t(my_multitarget)
        task_run = test_task.dbnd_run().root_task_run
        sync_local = task_run.sync_local

        assert len(sync_local.inputs_to_sync) == 1
        task_param, old_multitarget = sync_local.inputs_to_sync[0]
        task_param.value_meta_conf = ValueMetaConf(
            log_preview=True,
            log_preview_size=10000,
            log_schema=True,
            log_size=True,
            log_stats=LogDataRequest(
                include_all_boolean=True,
                include_all_numeric=True,
                include_all_string=True,
            ),
            log_histograms=LogDataRequest(),
        )

        assert task_param == test_task._params.get_param("input_")
        assert old_multitarget == my_multitarget

        local_multitarget = create_local_multitarget()

        with mock_fs_download as mocked_fs_download, mock_file_metadata_registry, mock_target_move_from as mock_target_move_from:
            monkeypatch.setattr(FileTarget, "tmp", mock_tmp)

            # only pre_execute is checked because post_execute code is unreachable for MultiTargets
            sync_local.sync_pre_execute()
            assert mocked_fs_download.call_count == 2
            mocked_fs_download.assert_has_calls([
                call(remote_subtarget.path, TMP_FILE_PATH)
                for remote_subtarget, local_subtarget in zip(
                    my_multitarget.targets, local_multitarget.targets)
            ])
        # check if test_task.input_ was changed to local after sync_pre_execute
        self.compare_multitargets(test_task.input_, local_multitarget)

        sync_local.sync_post_execute()
        # check if test_task.input_ was set back to original target
        self.compare_multitargets(test_task.input_, my_multitarget)
Example #20
    def test_get_value_meta_preview_small_size(self, value, value_type, target,
                                               expected_value_preview):
        tracking_config = TrackingConfig.from_databand_context()
        tracking_config.value_reporting_strategy = ValueTrackingLevel.ALL

        result = get_value_meta(
            value,
            ValueMetaConf(),
            tracking_config,
            value_type=value_type,
            target=target,
        )

        assert result.value_preview == expected_value_preview
Example #21
    def test_target_value_meta(self):
        v = target("a")
        meta_conf = ValueMetaConf.enabled()
        target_value_meta = TargetPathLibValueType().get_value_meta(
            v, meta_conf=meta_conf)

        expected_value_meta = ValueMeta(
            value_preview='"a"',
            data_dimensions=None,
            data_schema={"type": "Path"},
            data_hash=fast_hasher.hash(v),
        )

        assert target_value_meta == expected_value_meta
Example #22
 def _build_meta_conf(self):
     # type: () -> ValueMetaConf
     """
     Translate this configuration into value meta conf
     WE EXPECT IT TO HAVE ALL THE INNER VALUES SET WITHOUT NONES
     """
     return ValueMetaConf(
         log_schema=self.log_value_schema,
         log_size=self.log_value_size,
         log_preview_size=self.log_value_preview_max_len,
         log_preview=self.log_value_preview,
         log_stats=self.log_value_stats,
         log_histograms=self.log_histograms,
     )
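
Because every field here is populated, this conf can act as the fallback side of a merge with a sparse, parameter-level conf. A minimal sketch, assuming merge_if_none fills only the None fields; tracking_config stands in for the configuration object that owns _build_meta_conf:

# Parameter-level values win; config-level values fill the remaining Nones.
param_mc = ValueMetaConf(log_schema=True)       # only log_schema requested
config_mc = tracking_config._build_meta_conf()  # fully populated, no Nones
effective = param_mc.merge_if_none(config_mc)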
Example #23
def log_data(
        key,  # type: str
        value=None,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
        path=None,  # type: Optional[str]
        operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
        with_preview=None,  # type: Optional[bool]
        with_size=None,  # type: Optional[bool]
        with_schema=None,  # type: Optional[bool]
        with_stats=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
        with_histograms=None,  # type: Optional[Union[bool, str, List[str], LogDataRequest]]
        raise_on_error=False,  # type: bool
):  # type: (...) -> None
    """
    Log data information to dbnd.

    @param key: Name of the data.
    @param value: Value of the data; currently only dataframes and table views are supported.
    @param path: Optional target or path representing a target to connect the data to.
    @param operation_type: Type of operation performed on the target: reading or writing the data.
    @param with_preview: True if a preview of the data should be logged.
    @param with_size: True if the size of the data should be logged.
    @param with_schema: True if the schema of the data should be logged.
    @param with_stats: True if stats of the data should be calculated and logged.
    @param with_histograms: True if histograms of the data should be calculated and logged.
    @param raise_on_error: raise if an error occurs.
    """
    tracker = _get_tracker()
    if not tracker:
        message = TRACKER_MISSING_MESSAGE % ("log_data", )
        get_one_time_logger().log_once(message, "log_data", logging.WARNING)
        return

    meta_conf = ValueMetaConf(
        log_preview=with_preview,
        log_schema=with_schema,
        log_size=with_size,
        log_stats=with_stats,
        log_histograms=with_histograms,
    )

    tracker.log_data(
        key,
        value,
        meta_conf=meta_conf,
        path=path,
        operation_type=operation_type,
        raise_on_error=raise_on_error,
    )
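
A hedged usage sketch for log_data; customers_df is a placeholder, and the LogDataRequest flags follow the fields shown elsewhere on this page (e.g. the sync-local tests):

# Log the schema plus stats for all numeric columns only.
log_data(
    "customers",
    value=customers_df,
    with_schema=True,
    with_stats=LogDataRequest(include_all_numeric=True),
)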
Example #24
 def log_dataframe(
     self,
     key,
     df,
     with_preview=True,
     with_schema=True,
     with_size=True,
     with_stats=False,
 ):
     meta_conf = ValueMetaConf(
         log_preview=with_preview,
         log_schema=with_schema,
         log_size=with_size,
         log_stats=with_stats,
     )
     self.tracker.log_data(key, df, meta_conf=meta_conf)
Example #25
    def test_get_value_meta_empty(self, snowflake_table):
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table,
            meta_conf=(ValueMetaConf(log_preview=False,
                                     log_schema=False,
                                     log_size=False)),
        )

        # Assert
        assert value_meta.value_preview is None
        assert value_meta.data_dimensions is None
        assert value_meta.data_schema == {}
        assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE
        assert not snowflake_table.snowflake_ctrl.get_column_types.called
        assert not snowflake_table.snowflake_ctrl.get_dimensions.called
        assert not snowflake_table.snowflake_ctrl.to_preview.called
Example #26
    def test_log_schema(
        self, tracking_config, param_log_schema, config_log_schema,
        expected_log_schema
    ):  # type: (Callable[[], TrackingConfig], bool, bool, bool) -> None
        # Arrange
        tracking_config = tracking_config()
        param_mc = ValueMetaConf(log_schema=param_log_schema)
        if config_log_schema is not None:
            tracking_config.log_value_schema = config_log_schema

        # Act
        actual_value_meta_conf = tracking_config.get_value_meta_conf(
            param_mc, ObjectValueType())

        # Assert
        assert actual_value_meta_conf.log_schema == expected_log_schema
Example #27
    def test_task_run_sync_local_file_target(
        self,
        monkeypatch,
        test_task,
        my_target,
        mock_fs_download,
        mock_file_metadata_registry,
        mock_target_move_from,
    ):
        test_task = test_task.t(my_target)
        task_run = test_task.dbnd_run().root_task_run
        sync_local = task_run.sync_local

        assert len(sync_local.inputs_to_sync) == 1

        task_param, old_target = sync_local.inputs_to_sync[0]
        task_param.value_meta_conf = ValueMetaConf(
            log_preview=True,
            log_preview_size=10000,
            log_schema=True,
            log_size=True,
            log_stats=LogDataRequest(
                include_all_boolean=True,
                include_all_numeric=True,
                include_all_string=True,
            ),
            log_histograms=LogDataRequest(),
        )

        assert task_param == test_task._params.get_param("input_")
        assert old_target == my_target
        local_target = target(
            os.path.join(DBND_LOCAL_ROOT, LOCAL_SYNC_CACHE_NAME),
            my_target.path.lstrip("/"),
        )
        with mock_fs_download as mocked_fs_download, mock_file_metadata_registry, mock_target_move_from as mock_target_move_from:
            monkeypatch.setattr(FileTarget, "tmp", mock_tmp)

            sync_local.sync_pre_execute()
            mocked_fs_download.assert_called_once_with(my_target.path,
                                                       TMP_FILE_PATH)

        assert test_task.input_ == local_target

        sync_local.sync_post_execute()

        assert test_task.input_ == my_target
Example #28
    def test_get_histograms_and_stats(self):
        with mock.patch(
                "dbnd_postgres.postgres_values.PostgresController._query"
        ) as query_patch:
            # Arrange
            pg_stats_data = [{
                "attname": "customer",
                "null_frac": 0.5,
                "n_distinct": 8,
                "most_common_vals": "{customerA, customerB}",
                "most_common_freqs": [0.2, 0.2],
            }]
            pg_class_data = [{"reltuples": 10}]
            information_schema_columns_data = [{
                "column_name": "customer",
                "data_type": "varchar"
            }]
            query_patch.side_effect = [
                pg_stats_data,
                pg_class_data,
                information_schema_columns_data,
            ]

            expected_columns_stats = [
                ColumnStatsArgs(
                    column_name="customer",
                    column_type="varchar",
                    records_count=10,
                    distinct_count=8,
                    null_count=5,
                )
            ]
            expected_histograms = {
                "customer": ([2, 2, 1], ["customerA", "customerB", "_others"])
            }
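            # Expected arithmetic: reltuples gives 10 rows; null_frac 0.5 -> 5 nulls
            # (null_count=5); each most_common_freq of 0.2 -> 2 rows per common
            # value; the remaining 10 - 5 - 2 - 2 = 1 row lands in "_others".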

            # Act
            postgres = PostgresController("user@database", "data_table")
            meta_conf = ValueMetaConf.enabled()
            columns_stats, histograms = postgres.get_histograms_and_stats(
                meta_conf)

            # Assert
            assert columns_stats == expected_columns_stats
            assert histograms == expected_histograms
Example #29
    def test_get_value_meta(self, snowflake_table):
        value_meta = SnowflakeTableValueType().get_value_meta(
            snowflake_table, meta_conf=(ValueMetaConf.enabled()))

        # Assert
        assert value_meta.value_preview == "test preview"
        assert value_meta.data_dimensions == [42, 12]
        assert value_meta.data_schema == {
            "type": "SnowflakeTable",
            "column_types": {
                "name": "varchar"
            },
            "size.bytes": 500,
        }
        assert value_meta.data_hash == EXPECTED_SNOWFLAKE_TABLE_SIGNATURE
        assert snowflake_table.snowflake_ctrl.get_column_types.called
        assert snowflake_table.snowflake_ctrl.get_dimensions.called
        assert snowflake_table.snowflake_ctrl.to_preview.called
Example #30
    def test_log_preview_size(
        self,
        tracking_config,
        param_log_preview_size,
        config_log_preview_size,
        expected_log_preview_size,
    ):  # type: (Callable[[], TrackingConfig], int, int, int) -> None
        # Arrange
        tc = tracking_config()
        param_mc = ValueMetaConf(log_preview_size=param_log_preview_size)
        if config_log_preview_size is not None:
            tc.log_value_preview_max_len = config_log_preview_size

        # Act
        actual_value_meta_conf = tc.get_value_meta_conf(param_mc)

        # Assert
        assert actual_value_meta_conf.log_preview_size == expected_log_preview_size