def _get_dataset_column_histogram(
    self, column_profile: DatasetFieldProfileClass, column: str
) -> None:
    if self.config.include_field_histogram:
        self.dataset.set_config_value("interactive_evaluation", True)
        res = self.dataset.expect_column_kl_divergence_to_be_less_than(
            column,
            partition_object=None,
            threshold=None,
            result_format="COMPLETE",
        ).result
        if "details" in res and "observed_partition" in res["details"]:
            partition = res["details"]["observed_partition"]
            column_profile.histogram = HistogramClass(
                [str(v) for v in partition["bins"]],
                [
                    partition["tail_weights"][0],
                    *partition["weights"],
                    partition["tail_weights"][1],
                ],
            )
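# A minimal sketch (an illustration, not part of the profiler) of how the
# "COMPLETE"-format result above maps onto HistogramClass. The numbers are
# hypothetical; the key shapes follow the observed_partition details returned
# by expect_column_kl_divergence_to_be_less_than:
#
#   res["details"]["observed_partition"] == {
#       "bins": [0, 10, 20],          # N bucket boundaries
#       "weights": [0.5, 0.3],        # N - 1 in-bucket weights
#       "tail_weights": [0.1, 0.1],   # mass below the first / above the last boundary
#   }
#
#   # ...which the method above converts to:
#   HistogramClass(
#       ["0", "10", "20"],            # stringified boundaries
#       [0.1, 0.5, 0.3, 0.1],         # [lower tail, *weights, upper tail]
#   )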
def _handle_convert_column_evrs(  # noqa: C901 (complexity)
    self,
    profile: DatasetProfileClass,
    column: str,
    col_evrs: Iterable[ExpectationValidationResult],
    pretty_name: str,
    send_sample_values: bool,
) -> None:
    # TRICKY: This method mutates the profile directly.

    column_profile = DatasetFieldProfileClass(fieldPath=column)
    profile.fieldProfiles = profile.fieldProfiles or []
    profile.fieldProfiles.append(column_profile)

    for evr in col_evrs:
        exp: str = evr.expectation_config.expectation_type
        res: dict = evr.result
        if not res:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"{exp} did not yield any results",
            )
            continue

        if exp == "expect_column_unique_value_count_to_be_between":
            column_profile.uniqueCount = res["observed_value"]
        elif exp == "expect_column_proportion_of_unique_values_to_be_between":
            column_profile.uniqueProportion = res["observed_value"]
        elif exp == "expect_column_values_to_not_be_null":
            column_profile.nullCount = res["unexpected_count"]
            if (
                "unexpected_percent" in res
                and res["unexpected_percent"] is not None
            ):
                column_profile.nullProportion = res["unexpected_percent"] / 100
        elif exp == "expect_column_values_to_not_match_regex":
            # ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
            pass
        elif exp == "expect_column_mean_to_be_between":
            column_profile.mean = str(res["observed_value"])
        elif exp == "expect_column_min_to_be_between":
            column_profile.min = str(res["observed_value"])
        elif exp == "expect_column_max_to_be_between":
            column_profile.max = str(res["observed_value"])
        elif exp == "expect_column_median_to_be_between":
            column_profile.median = str(res["observed_value"])
        elif exp == "expect_column_stdev_to_be_between":
            column_profile.stdev = str(res["observed_value"])
        elif exp == "expect_column_quantile_values_to_be_between":
            if "observed_value" in res:
                column_profile.quantiles = [
                    QuantileClass(quantile=str(quantile), value=str(value))
                    for quantile, value in zip(
                        res["observed_value"]["quantiles"],
                        res["observed_value"]["values"],
                    )
                ]
        elif exp == "expect_column_values_to_be_in_set":
            column_profile.sampleValues = [
                str(v) for v in res["partial_unexpected_list"]
            ]
            if not send_sample_values:
                column_profile.sampleValues = []
        elif exp == "expect_column_kl_divergence_to_be_less_than":
            if "details" in res and "observed_partition" in res["details"]:
                partition = res["details"]["observed_partition"]
                column_profile.histogram = HistogramClass(
                    [str(v) for v in partition["bins"]],
                    [
                        partition["tail_weights"][0],
                        *partition["weights"],
                        partition["tail_weights"][1],
                    ],
                )
        elif exp == "expect_column_distinct_values_to_be_in_set":
            if "details" in res and "value_counts" in res["details"]:
                # This can be used to produce a bar chart since it includes
                # values and frequencies. As such, it is handled differently
                # from expect_column_values_to_be_in_set, which is
                # non-exhaustive.
                column_profile.distinctValueFrequencies = [
                    ValueFrequencyClass(value=str(value), frequency=count)
                    for value, count in res["details"]["value_counts"].items()
                ]
                if not send_sample_values:
                    column_profile.distinctValueFrequencies = []
        elif exp == "expect_column_values_to_be_in_type_list":
            # ignore; we already know the types for each column via ingestion
            pass
        elif exp == "expect_column_values_to_be_unique":
            # ignore; this is generally covered by the unique value count test
            pass
        else:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"warning: unknown column mapper {exp} in col {column}",
            )
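# A minimal sketch (an illustration, not part of the profiler) of one EVR as
# the dispatcher above consumes it. The attribute names follow
# great_expectations' ExpectationValidationResult; the sample values are
# hypothetical.
#
#   evr.expectation_config.expectation_type  # -> "expect_column_mean_to_be_between"
#   evr.result                               # -> {"observed_value": 12.5}
#
#   # The matching elif branch then records the stringified stat:
#   # column_profile.mean = str(res["observed_value"])  # "12.5"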