def _get_dataset_column_distinct_value_frequencies(
    self, column_profile: DatasetFieldProfileClass, column: str
) -> None:
    if self.config.include_field_distinct_value_frequencies:
        column_profile.distinctValueFrequencies = [
            ValueFrequencyClass(value=str(value), frequency=count)
            for value, count in self.dataset.get_column_value_counts(column).items()
        ]
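# Illustrative sketch (not part of the original source): assuming
# self.dataset.get_column_value_counts("status") returns a mapping-like object
# (e.g. a pandas Series) such as {"active": 120, "inactive": 30}, the
# comprehension above would produce
#   [ValueFrequencyClass(value="active", frequency=120),
#    ValueFrequencyClass(value="inactive", frequency=30)]
# The column name and counts shown here are hypothetical examples.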
def extract_table_profiles(
    self,
    analysis_metrics: DataFrame,
) -> None:
    self.profile.fieldProfiles = []

    analysis_metrics = analysis_metrics.toPandas()
    # DataFrame with the following columns:
    # entity: "Column" for column profiles, "Table" for table profiles
    # instance: name of the column being profiled; "*" for table profiles
    # name: name of the metric. Histogram metrics are formatted as "Histogram.<metric>.<value>"
    # value: value of the metric
    column_metrics = analysis_metrics[analysis_metrics["entity"] == "Column"]

    # resolve histogram types for grouping
    column_metrics["kind"] = column_metrics["name"].apply(
        lambda x: "Histogram" if x.startswith("Histogram.") else x
    )

    column_histogram_metrics = column_metrics[column_metrics["kind"] == "Histogram"]
    column_nonhistogram_metrics = column_metrics[column_metrics["kind"] != "Histogram"]

    histogram_columns = set()
    if len(column_histogram_metrics) > 0:
        # we only want the absolute counts for each histogram for now
        column_histogram_metrics = column_histogram_metrics[
            column_histogram_metrics["name"].apply(
                lambda x: x.startswith("Histogram.abs.")
            )
        ]
        # get the histogram bins by chopping off the "Histogram.abs." prefix
        column_histogram_metrics["bin"] = column_histogram_metrics["name"].apply(
            lambda x: x[14:]
        )

        # reshape histogram counts for easier access
        histogram_counts = column_histogram_metrics.set_index(["instance", "bin"])[
            "value"
        ]
        histogram_columns = set(histogram_counts.index.get_level_values(0))

    profiled_columns = set()
    if len(column_nonhistogram_metrics) > 0:
        # reshape other metrics for easier access
        nonhistogram_metrics = column_nonhistogram_metrics.set_index(
            ["instance", "name"]
        )["value"]
        profiled_columns = set(nonhistogram_metrics.index.get_level_values(0))

    for column_spec in self.column_specs:
        column = column_spec.column
        column_profile = column_spec.column_profile

        if column not in profiled_columns:
            continue

        # convert to a dict so we can use .get
        deequ_column_profile = nonhistogram_metrics.loc[column].to_dict()

        # uniqueCount, uniqueProportion, nullCount, nullProportion, and sampleValues
        # are already set in TableWrapper
        column_profile.min = null_str(deequ_column_profile.get("Minimum"))
        column_profile.max = null_str(deequ_column_profile.get("Maximum"))
        column_profile.mean = null_str(deequ_column_profile.get("Mean"))
        column_profile.median = null_str(
            deequ_column_profile.get("ApproxQuantiles-0.5")
        )
        column_profile.stdev = null_str(
            deequ_column_profile.get("StandardDeviation")
        )
        if all(
            deequ_column_profile.get(f"ApproxQuantiles-{quantile}") is not None
            for quantile in QUANTILES
        ):
            column_profile.quantiles = [
                QuantileClass(
                    quantile=str(quantile),
                    value=str(deequ_column_profile[f"ApproxQuantiles-{quantile}"]),
                )
                for quantile in QUANTILES
            ]

        if column in histogram_columns:
            column_histogram = histogram_counts.loc[column]
            # sort so output is deterministic
            column_histogram = column_histogram.sort_index()

            if column_spec.histogram_distinct:
                column_profile.distinctValueFrequencies = [
                    ValueFrequencyClass(
                        value=value, frequency=int(column_histogram.loc[value])
                    )
                    for value in column_histogram.index
                ]
                # sort so output is deterministic
                column_profile.distinctValueFrequencies = sorted(
                    column_profile.distinctValueFrequencies, key=lambda x: x.value
                )
            else:
                column_profile.histogram = HistogramClass(
                    [str(x) for x in column_histogram.index],
                    [float(x) for x in column_histogram],
                )

        # append the column profile to the dataset profile
        self.profile.fieldProfiles.append(column_profile)
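# Illustrative sketch (not part of the original source): the pandas frame produced
# by toPandas() above is assumed to look roughly like the following, with one row
# per (entity, instance, name) metric; the column names and values are hypothetical.
#
#   entity  instance  name                     value
#   Column  age       Minimum                  18.0
#   Column  age       Maximum                  90.0
#   Column  age       Mean                     42.3
#   Column  age       ApproxQuantiles-0.5      41.0
#   Column  status    Histogram.abs.active     120.0
#   Column  status    Histogram.abs.inactive   30.0
#   Table   *         Size                     150.0
#
# Rows whose name starts with "Histogram.abs." are reshaped into per-bin counts
# (e.g. "Histogram.abs.active" -> bin "active", count 120), while the remaining
# column rows are pivoted into a per-column dict such as
#   {"Minimum": 18.0, "Maximum": 90.0, "Mean": 42.3, "ApproxQuantiles-0.5": 41.0}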
def _handle_convert_column_evrs(  # noqa: C901 (complexity)
    self,
    profile: DatasetProfileClass,
    column: str,
    col_evrs: Iterable[ExpectationValidationResult],
    pretty_name: str,
    send_sample_values: bool,
) -> None:
    # TRICKY: This method mutates the profile directly.

    column_profile = DatasetFieldProfileClass(fieldPath=column)
    profile.fieldProfiles = profile.fieldProfiles or []
    profile.fieldProfiles.append(column_profile)

    for evr in col_evrs:
        exp: str = evr.expectation_config.expectation_type
        res: dict = evr.result
        if not res:
            self.report.report_warning(
                f"profile of {pretty_name}", f"{exp} did not yield any results"
            )
            continue

        if exp == "expect_column_unique_value_count_to_be_between":
            column_profile.uniqueCount = res["observed_value"]
        elif exp == "expect_column_proportion_of_unique_values_to_be_between":
            column_profile.uniqueProportion = res["observed_value"]
        elif exp == "expect_column_values_to_not_be_null":
            column_profile.nullCount = res["unexpected_count"]
            if (
                "unexpected_percent" in res
                and res["unexpected_percent"] is not None
            ):
                column_profile.nullProportion = res["unexpected_percent"] / 100
        elif exp == "expect_column_values_to_not_match_regex":
            # ignore; generally used for whitespace checks using regex r"^\s+|\s+$"
            pass
        elif exp == "expect_column_mean_to_be_between":
            column_profile.mean = str(res["observed_value"])
        elif exp == "expect_column_min_to_be_between":
            column_profile.min = str(res["observed_value"])
        elif exp == "expect_column_max_to_be_between":
            column_profile.max = str(res["observed_value"])
        elif exp == "expect_column_median_to_be_between":
            column_profile.median = str(res["observed_value"])
        elif exp == "expect_column_stdev_to_be_between":
            column_profile.stdev = str(res["observed_value"])
        elif exp == "expect_column_quantile_values_to_be_between":
            if "observed_value" in res:
                column_profile.quantiles = [
                    QuantileClass(quantile=str(quantile), value=str(value))
                    for quantile, value in zip(
                        res["observed_value"]["quantiles"],
                        res["observed_value"]["values"],
                    )
                ]
        elif exp == "expect_column_values_to_be_in_set":
            column_profile.sampleValues = [
                str(v) for v in res["partial_unexpected_list"]
            ]
            if not send_sample_values:
                column_profile.sampleValues = []
        elif exp == "expect_column_kl_divergence_to_be_less_than":
            if "details" in res and "observed_partition" in res["details"]:
                partition = res["details"]["observed_partition"]
                column_profile.histogram = HistogramClass(
                    [str(v) for v in partition["bins"]],
                    [
                        partition["tail_weights"][0],
                        *partition["weights"],
                        partition["tail_weights"][1],
                    ],
                )
        elif exp == "expect_column_distinct_values_to_be_in_set":
            if "details" in res and "value_counts" in res["details"]:
                # This can be used to produce a bar chart since it includes values and
                # frequencies. As such, it is handled differently from
                # expect_column_values_to_be_in_set, which is non-exhaustive.
                column_profile.distinctValueFrequencies = [
                    ValueFrequencyClass(value=str(value), frequency=count)
                    for value, count in res["details"]["value_counts"].items()
                ]
                if not send_sample_values:
                    column_profile.distinctValueFrequencies = []
        elif exp == "expect_column_values_to_be_in_type_list":
            # ignore; we already know the types for each column via ingestion
            pass
        elif exp == "expect_column_values_to_be_unique":
            # ignore; this is generally covered by the unique value count test
            pass
        else:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"warning: unknown column mapper {exp} in col {column}",
            )
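# Illustrative sketch (not part of the original source): hypothetical shapes of
# evr.result for a few of the expectation types handled above, inferred from the
# keys this method reads; actual Great Expectations results carry additional fields.
#
#   expect_column_mean_to_be_between     -> {"observed_value": 42.3}
#   expect_column_values_to_not_be_null  -> {"unexpected_count": 5,
#                                            "unexpected_percent": 2.5}
#   expect_column_quantile_values_to_be_between
#       -> {"observed_value": {"quantiles": [0.25, 0.5, 0.75],
#                              "values": [30, 41, 55]}}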