def log_parameter_data(self, parameter, target, value, operation_type, operation_status):
    # type: (TaskRunTracker, ParameterDefinition, Target, Any, DbndTargetOperationType, DbndTargetOperationStatus) -> None
    """Compute value metadata for a task parameter's value and report it to the tracking store as a target operation.

    Best-effort: any failure is logged as non-critical and never propagates to the caller.
    """
    features = self.settings.features
    # Nothing to do when value-meta logging is disabled, or there is no value to describe.
    if not features.log_value_meta or value is None:
        return
    try:
        conf = features.get_value_meta_conf(
            parameter.value_meta_conf,
            value_type=parameter.value_type,
            target=target,
        )
        # Metric key is "<task name>.<parameter name>".
        meta_key = "{}.{}".format(self.task_run.task.task_name, parameter.name)
        target.target_meta = get_value_meta_from_value(meta_key, value, conf)
        self.tracking_store.log_target(
            task_run=self.task_run,
            target=target,
            target_meta=target.target_meta,
            operation_type=operation_type,
            operation_status=operation_status,
            param_name=parameter.name,
            task_def_uid=parameter.task_definition_uid,
        )
    except Exception as exc:
        # Tracking must not break the user's run - record and continue.
        log_exception(
            "Error occurred during target logging for %s" % (target,),
            exc,
            non_critical=True,
        )
def test_histogram_others(self, meta_conf):
    """Values beyond the 50-bucket histogram cap are folded into a final "_others" bucket."""
    # "str-i" appears i times for i = 1..100: 5050 values total, 100 distinct.
    samples = ["str-{}".format(i) for i in range(1, 101) for _ in range(i)]
    samples_value = self.data_to_value([samples])
    value_meta = get_value_meta_from_value("string_with_others", samples_value, meta_conf)

    histogram = value_meta.histograms["test_column_0"]
    counts, labels = histogram[0], histogram[1]
    assert len(counts) == 50
    assert len(labels) == 50
    # Buckets are ordered by descending frequency...
    assert (counts[0], labels[0]) == (100, "str-100")
    assert (counts[10], labels[10]) == (90, "str-90")
    assert (counts[-2], labels[-2]) == (52, "str-52")
    # ...and everything below the 50-bucket cutoff is aggregated into "_others".
    assert counts[-1] == sum(range(1, 52))
    assert labels[-1] == "_others"

    stats = value_meta.descriptive_stats["test_column_0"]
    assert stats["count"] == 5050 == sum(counts)
    assert stats["non-null"] == 5050
    assert stats["null-count"] == 0
    assert stats["distinct"] == 100
    assert stats["type"] in ["str", "string"]
def log_data(
    self,
    key,  # type: str
    data,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
    meta_conf,  # type: ValueMetaConf
    path=None,  # type: Optional[Union[Target,str]]
    operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
    operation_status=DbndTargetOperationStatus.OK,  # type: DbndTargetOperationStatus
    raise_on_error=False,  # type: bool
):  # type: (...) -> None
    """Extract value metadata (user metrics, histograms) from `data` and report it.

    When `path` is given, the metadata is also attached to that target as a
    read/write operation. With `raise_on_error=False` (the default) any failure
    is logged as non-critical and swallowed; otherwise it is re-raised.
    """
    try:
        # Combine meta_conf with the config settings
        meta_conf = self.settings.tracking.get_value_meta_conf(meta_conf)

        value_meta = get_value_meta_from_value(key, data, meta_conf, raise_on_error)
        if not value_meta:
            logger.warning(
                "Couldn't log the wanted data {name}, reason - can't log objects of type {value_type} "
                .format(name=key, value_type=type(data)))
            return

        # One timestamp for all the events emitted from this call.
        ts = utcnow()
        if path:
            self.tracking_store.log_target(
                task_run=self.task_run,
                target=path,
                target_meta=value_meta,
                operation_type=operation_type,
                operation_status=operation_status,
                param_name=key,
            )

        metrics = value_meta.build_metrics_for_key(key, meta_conf)
        if metrics["user"]:
            self._log_metrics(metrics["user"])
        if metrics["histograms"]:
            self.tracking_store.log_histograms(task_run=self.task_run,
                                               key=key,
                                               value_meta=value_meta,
                                               timestamp=ts)
        if not (metrics["user"] or metrics["histograms"] or path):
            logger.info("No metrics to log_data(key={}, data={})".format(
                key, data))
    except Exception as ex:
        # Fix: the message previously said "log_dataframe", which is not this
        # method's name and made failures hard to trace in the logs.
        log_exception(
            "Error occurred during log_data for %s" % (key, ),
            ex,
            non_critical=not raise_on_error,
        )
        if raise_on_error:
            raise
def test_boolean_histogram(self, meta_conf, booleans_value):
    """A boolean column yields True/False/None buckets and boolean column stats."""
    value_meta = get_value_meta_from_value("booleans", booleans_value, meta_conf)

    histogram = value_meta.histograms["test_column_0"]
    bucket_counts, bucket_values = histogram[0], histogram[1]
    assert bucket_counts == [30, 20, 10]
    assert bucket_values == [True, False, None]

    column_stats = value_meta.descriptive_stats["test_column_0"]
    assert column_stats["count"] == 60
    assert column_stats["type"] in ("bool", "boolean")
def test_complex_column(self, spark_session, meta_conf, numbers):
    """A list-typed (complex) column cannot be histogrammed; verify it is skipped
    cleanly and only the numeric column produces histogram/stats entries."""
    # Build two columns: column 0 holds the numbers, column 1 holds list values
    # (None in both columns wherever the number is falsy).
    numeric_column = []
    list_column = []
    for n in numbers:
        if n:
            numeric_column.append(n)
            list_column.append([str(n), str(n + 1)])
        else:
            numeric_column.append(None)
            list_column.append(None)

    df = self.data_to_value([numeric_column, list_column])
    value_meta = get_value_meta_from_value("complex", df, meta_conf)

    # Only the numeric column is represented; the complex column is dropped.
    assert list(value_meta.histograms) == ["test_column_0"]
    assert list(value_meta.descriptive_stats) == ["test_column_0"]
    self.validate_numeric_histogram_and_stats(value_meta, "test_column_0")
def test_multiple_columns(self, meta_conf, numbers):
    """Each column of a multi-column frame gets its own histogram and stats;
    identical string columns must produce identical histograms."""
    # Columns: int, float, and two identical string renderings of the numbers.
    columns = ([], [], [], [])
    for n in numbers:
        row = (n, float(n), str(n), str(n)) if n else (None, None, None, None)
        for column, cell in zip(columns, row):
            column.append(cell)

    df = self.data_to_value(list(columns))
    value_meta = get_value_meta_from_value("multi_column", df, meta_conf)

    # Both numeric columns pass the shared numeric validation.
    for numeric_column in ("test_column_0", "test_column_1"):
        self.validate_numeric_histogram_and_stats(value_meta, numeric_column)

    first_str_histogram = value_meta.histograms["test_column_2"]
    second_str_histogram = value_meta.histograms["test_column_3"]
    assert first_str_histogram[0] == [4, 3, 2, 1]
    assert first_str_histogram[1] == ["1", "5", None, "3"]
    assert first_str_histogram == second_str_histogram
def test_null_column(self, meta_conf, numbers_value):
    """An all-null column collapses to a single None bucket with zero non-null values."""
    df = self.data_to_value([[None] * 20])
    value_meta = get_value_meta_from_value("nulls", df, meta_conf)

    histogram = value_meta.histograms["test_column_0"]
    assert histogram[0] == [20]
    assert histogram[1] == [None]

    stats = value_meta.descriptive_stats["test_column_0"]
    expected_stats = {
        "count": 20,
        "non-null": 0,
        "null-count": 20,
        "distinct": 1,
        "type": "object",
    }
    for stat_name, expected in expected_stats.items():
        assert stats[stat_name] == expected
def test_strings_histogram(self, meta_conf, strings_value):
    """String histograms are ordered by descending frequency, with nulls as their own bucket."""
    value_meta = get_value_meta_from_value("strings", strings_value, meta_conf)

    histogram = value_meta.histograms["test_column_0"]
    bucket_counts, bucket_values = histogram[0], histogram[1]
    assert bucket_counts == [30, 20, 15, 5]
    assert bucket_values == [
        "Ola Mundo!", "Shalom Olam!", "Hello World!", None
    ]

    stats = value_meta.descriptive_stats["test_column_0"]
    assert stats["count"] == 70
    assert stats["non-null"] == 65
    assert stats["null-count"] == 5
    assert stats["distinct"] == 4
    assert stats["type"] in ("str", "string")
def log_data(
    self,
    key,  # type: str
    data,  # type: Union[pd.DataFrame, spark.DataFrame, PostgresTable, SnowflakeTable]
    meta_conf,  # type: ValueMetaConf
    path=None,  # type: Optional[Union[Target,str]]
    operation_type=DbndTargetOperationType.read,  # type: DbndTargetOperationType
    operation_status=DbndTargetOperationStatus.OK,  # type: DbndTargetOperationStatus
):  # type: (...) -> None
    """Best-effort reporting of value metadata (metrics, histograms) for `data`.

    When `path` is given, the metadata is also attached to that target.
    Failures are logged as non-critical and never propagate to the caller.
    """
    try:
        # Layer the caller's meta_conf on top of the configured feature defaults.
        meta_conf = self.settings.features.get_value_meta_conf(meta_conf)
        value_meta = get_value_meta_from_value(key, data, meta_conf=meta_conf)
        if not value_meta:
            # Value type is not supported for meta extraction - nothing to report.
            return

        # One timestamp for all events emitted from this call.
        timestamp = utcnow()
        if path:
            self.tracking_store.log_target(
                task_run=self.task_run,
                target=path,
                target_meta=value_meta,
                operation_type=operation_type,
                operation_status=operation_status,
            )

        metrics = value_meta.build_metrics_for_key(key, meta_conf)
        user_metrics = metrics["user"]
        if user_metrics:
            self._log_metrics(user_metrics)
        if metrics["histograms"]:
            self.tracking_store.log_histograms(
                task_run=self.task_run,
                key=key,
                value_meta=value_meta,
                timestamp=timestamp,
            )
    except Exception as ex:
        # Tracking must not break the user's run - record and continue.
        log_exception(
            "Error occurred during log_dataframe for %s" % (key,),
            ex,
            non_critical=True,
        )
def test_float_column(self, meta_conf, floats_value):
    """Float columns pass the shared numeric histogram/stats validation."""
    meta = get_value_meta_from_value("floats", floats_value, meta_conf)
    self.validate_numeric_histogram_and_stats(meta, "test_column_0")
    # Returned so other tests can reuse the computed metadata.
    return meta