def generate_dataset_profile(  # noqa: C901 (complexity)
    self,
) -> DatasetProfileClass:
    """Run profiling queries for this dataset and assemble the profile.

    Work is issued in three stages — (1) row count + column metadata,
    (2) per-column type and cardinality, (3) per-column statistics — with
    the query combiner flushed between stages so batched queries execute
    together.

    Returns:
        A DatasetProfileClass with row/column counts and one field profile
        per table column; detailed statistics are computed only for the
        columns selected by ``_get_columns_to_profile``.
    """
    self.dataset.set_default_expectation_argument(
        "catch_exceptions", self.config.catch_exceptions
    )

    profile = DatasetProfileClass(timestampMillis=get_sys_time())
    if self.partition:
        profile.partitionSpec = PartitionSpecClass(partition=self.partition)
    profile.fieldProfiles = []

    self._get_dataset_rows(profile)

    all_columns = self.dataset.get_table_columns()
    profile.columnCount = len(all_columns)
    # Set for O(1) membership checks in the loop below.
    columns_to_profile = set(self._get_columns_to_profile())

    logger.debug(f"profiling {self.dataset_name}: flushing stage 1 queries")
    self.query_combiner.flush()

    columns_profiling_queue: List[_SingleColumnSpec] = []
    for column in all_columns:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        profile.fieldProfiles.append(column_profile)

        if column in columns_to_profile:
            column_spec = _SingleColumnSpec(column, column_profile)
            columns_profiling_queue.append(column_spec)

            self._get_column_type(column_spec, column)
            self._get_column_cardinality(column_spec, column)

    logger.debug(f"profiling {self.dataset_name}: flushing stage 2 queries")
    self.query_combiner.flush()

    assert profile.rowCount is not None
    row_count: int = profile.rowCount

    telemetry.telemetry_instance.ping(
        "profile_sql_table",
        # bucket by taking floor of log of the number of rows scanned
        {
            "rows_profiled": 10 ** int(log10(row_count + 1)),
        },
    )

    # Cardinality buckets small enough that a full distinct-value
    # frequency table is worth emitting. Hoisted here because the same
    # check was previously copy-pasted in four branches below.
    low_cardinality = [
        Cardinality.ONE,
        Cardinality.TWO,
        Cardinality.VERY_FEW,
        Cardinality.FEW,
    ]

    for column_spec in columns_profiling_queue:
        column = column_spec.column
        column_profile = column_spec.column_profile
        type_ = column_spec.type_
        cardinality = column_spec.cardinality

        non_null_count = column_spec.nonnull_count
        unique_count = column_spec.unique_count

        if self.config.include_field_null_count and non_null_count is not None:
            # Clamp to [0, row_count]: approximate counting queries can
            # report non_null_count greater than the row count.
            null_count = max(0, row_count - non_null_count)
            column_profile.nullCount = null_count
            if row_count > 0:
                # Sometimes this value is bigger than 1 because of the approx queries
                column_profile.nullProportion = min(1, null_count / row_count)

        if unique_count is not None:
            column_profile.uniqueCount = unique_count
            if non_null_count is not None and non_null_count > 0:
                # Sometimes this value is bigger than 1 because of the approx queries
                column_profile.uniqueProportion = min(
                    1, unique_count / non_null_count
                )

        self._get_dataset_column_sample_values(column_profile, column)

        if (
            type_ == ProfilerDataType.INT
            or type_ == ProfilerDataType.FLOAT
            or type_ == ProfilerDataType.NUMERIC
        ):
            if cardinality == Cardinality.UNIQUE:
                # Every value is distinct; aggregate stats add no value.
                pass
            elif cardinality in [
                Cardinality.ONE,
                Cardinality.TWO,
                Cardinality.VERY_FEW,
                Cardinality.FEW,
                Cardinality.MANY,
                Cardinality.VERY_MANY,
                # NOTE: Cardinality.UNIQUE used to be listed here as well,
                # but that entry was unreachable dead code — the branch
                # above already handles it.
            ]:
                self._get_dataset_column_min(column_profile, column)
                self._get_dataset_column_max(column_profile, column)
                self._get_dataset_column_mean(column_profile, column)
                self._get_dataset_column_median(column_profile, column)

                if type_ == ProfilerDataType.INT:
                    self._get_dataset_column_stdev(column_profile, column)

                self._get_dataset_column_quantiles(column_profile, column)
                self._get_dataset_column_histogram(column_profile, column)
                if cardinality in low_cardinality:
                    self._get_dataset_column_distinct_value_frequencies(
                        column_profile,
                        column,
                    )
            else:
                # unknown cardinality - skip
                pass

        elif type_ == ProfilerDataType.STRING:
            if cardinality in low_cardinality:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        elif type_ == ProfilerDataType.DATETIME:
            self._get_dataset_column_min(column_profile, column)
            self._get_dataset_column_max(column_profile, column)

            # FIXME: Re-add histogram once kl_divergence has been modified to support datetimes

            if cardinality in low_cardinality:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

        else:
            if cardinality in low_cardinality:
                self._get_dataset_column_distinct_value_frequencies(
                    column_profile,
                    column,
                )

    logger.debug(f"profiling {self.dataset_name}: flushing stage 3 queries")
    self.query_combiner.flush()

    return profile
def _handle_convert_column_evrs(  # noqa: C901 (complexity)
    self,
    profile: DatasetProfileClass,
    column: str,
    col_evrs: Iterable[ExpectationValidationResult],
    pretty_name: str,
    send_sample_values: bool,
) -> None:
    """Fold one column's expectation validation results into ``profile``.

    TRICKY: This method mutates the profile directly — it appends a new
    DatasetFieldProfileClass and fills its fields from the results.
    """
    column_profile = DatasetFieldProfileClass(fieldPath=column)
    profile.fieldProfiles = profile.fieldProfiles or []
    profile.fieldProfiles.append(column_profile)

    # Expectations whose observed_value maps 1:1 onto a stringified
    # field-profile attribute.
    scalar_stat_fields = {
        "expect_column_mean_to_be_between": "mean",
        "expect_column_min_to_be_between": "min",
        "expect_column_max_to_be_between": "max",
        "expect_column_median_to_be_between": "median",
        "expect_column_stdev_to_be_between": "stdev",
    }
    # Expectations that carry no profile information for us.
    ignored_expectations = (
        # generally used for whitespace checks using regex r"^\s+|\s+$"
        "expect_column_values_to_not_match_regex",
        # we already know the types for each column via ingestion
        "expect_column_values_to_be_in_type_list",
        # generally covered by the unique value count test
        "expect_column_values_to_be_unique",
    )

    for evr in col_evrs:
        expectation: str = evr.expectation_config.expectation_type
        result: dict = evr.result
        if not result:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"{expectation} did not yield any results",
            )
            continue

        if expectation in ignored_expectations:
            continue

        if expectation in scalar_stat_fields:
            setattr(
                column_profile,
                scalar_stat_fields[expectation],
                str(result["observed_value"]),
            )
        elif expectation == "expect_column_unique_value_count_to_be_between":
            column_profile.uniqueCount = result["observed_value"]
        elif expectation == "expect_column_proportion_of_unique_values_to_be_between":
            column_profile.uniqueProportion = result["observed_value"]
        elif expectation == "expect_column_values_to_not_be_null":
            column_profile.nullCount = result["unexpected_count"]
            unexpected_percent = result.get("unexpected_percent")
            if unexpected_percent is not None:
                column_profile.nullProportion = unexpected_percent / 100
        elif expectation == "expect_column_quantile_values_to_be_between":
            if "observed_value" in result:
                observed = result["observed_value"]
                column_profile.quantiles = [
                    QuantileClass(quantile=str(q), value=str(v))
                    for q, v in zip(observed["quantiles"], observed["values"])
                ]
        elif expectation == "expect_column_values_to_be_in_set":
            column_profile.sampleValues = [
                str(v) for v in result["partial_unexpected_list"]
            ]
            if not send_sample_values:
                column_profile.sampleValues = []
        elif expectation == "expect_column_kl_divergence_to_be_less_than":
            if "details" in result and "observed_partition" in result["details"]:
                partition = result["details"]["observed_partition"]
                column_profile.histogram = HistogramClass(
                    [str(v) for v in partition["bins"]],
                    [
                        partition["tail_weights"][0],
                        *partition["weights"],
                        partition["tail_weights"][1],
                    ],
                )
        elif expectation == "expect_column_distinct_values_to_be_in_set":
            if "details" in result and "value_counts" in result["details"]:
                # This can be used to produce a bar chart since it includes values and frequencies.
                # As such, it is handled differently from expect_column_values_to_be_in_set, which
                # is nonexhaustive.
                column_profile.distinctValueFrequencies = [
                    ValueFrequencyClass(value=str(value), frequency=count)
                    for value, count in result["details"]["value_counts"].items()
                ]
                if not send_sample_values:
                    column_profile.distinctValueFrequencies = []
        else:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"warning: unknown column mapper {expectation} in col {column}",
            )