def _get_dataset_column_sample_values(
    self, column_profile: DatasetFieldProfileClass, column: str
) -> None:
    """Fill ``column_profile.sampleValues`` with sample values for ``column``.

    Nothing happens unless ``self.config.include_field_sample_values`` is
    truthy. Otherwise the dataset is switched to interactive evaluation and
    ``expect_column_values_to_be_in_set`` is run against an empty value set;
    whatever that expectation reports under ``"partial_unexpected_list"`` is
    stringified and stored on the profile.

    Args:
        column_profile: Field profile that receives the sample values
            (mutated in place).
        column: Name of the column to sample.
    """
    if not self.config.include_field_sample_values:
        return

    # TODO do this without GE
    self.dataset.set_config_value("interactive_evaluation", True)

    validation = self.dataset.expect_column_values_to_be_in_set(
        column, [], result_format="SUMMARY"
    )
    unexpected_values = validation.result["partial_unexpected_list"]
    column_profile.sampleValues = list(map(str, unexpected_values))
def __init__(
    self,
    dataframe: DataFrame,
    spark: SparkSession,
    profiling_config: DataLakeProfilerConfig,
    report: DataLakeSourceReport,
    file_path: str,
):
    """Compute table-level and basic per-column statistics for ``dataframe``.

    Always records the row and column count on ``self.profile``. Unless
    ``profiling_config.profile_table_level_only`` is set, it additionally:

    * selects the columns allowed by ``allow_deny_patterns`` (capped at
      ``max_number_of_fields_to_profile``; dropped columns are reported via
      ``report.report_file_dropped``),
    * runs an approximate distinct count per column,
    * counts nulls per column (NaN counts as null for float/double columns),
    * optionally collects sample values, and
    * appends one ``_SingleColumnSpec`` per profiled column to
      ``self.column_specs``.

    Args:
        dataframe: Spark dataframe to profile.
        spark: Active Spark session, used to run the analysis.
        profiling_config: Profiling options (field filters, limits, flags).
        report: Report object that receives dropped-column notices.
        file_path: Path of the profiled file; only used in report messages.
    """
    self.spark = spark
    self.dataframe = dataframe
    self.analyzer = AnalysisRunner(spark).onData(dataframe)
    self.column_specs = []
    self.row_count = dataframe.count()
    self.profiling_config = profiling_config
    self.file_path = file_path
    self.columns_to_profile = []
    self.ignored_columns = []
    self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
    self.report = report

    self.profile.rowCount = self.row_count
    self.profile.columnCount = len(dataframe.columns)

    column_types = {x.name: x.dataType for x in dataframe.schema.fields}

    if self.profiling_config.profile_table_level_only:
        return

    # get column distinct counts
    # NOTE(review): analyzers are registered before the field cap below is
    # applied, so columns that end up dropped are still analysed -- confirm
    # whether that is intended.
    for column in dataframe.columns:
        if not self.profiling_config.allow_deny_patterns.allowed(column):
            self.ignored_columns.append(column)
            continue

        self.columns_to_profile.append(column)
        # Normal CountDistinct is ridiculously slow
        self.analyzer.addAnalyzer(ApproxCountDistinct(column))

    max_fields = self.profiling_config.max_number_of_fields_to_profile
    if max_fields is not None and len(self.columns_to_profile) > max_fields:
        columns_being_dropped = self.columns_to_profile[max_fields:]
        self.columns_to_profile = self.columns_to_profile[:max_fields]

        self.report.report_file_dropped(
            f"The max_number_of_fields_to_profile={max_fields} reached. "
            f"Profile of columns {self.file_path}({', '.join(sorted(columns_being_dropped))})"
        )

    analysis_result = self.analyzer.run()
    analysis_metrics = AnalyzerContext.successMetricsAsJson(
        self.spark, analysis_result
    )

    # reshape distinct counts into dictionary
    column_distinct_counts = {
        x["instance"]: int(x["value"])
        for x in analysis_metrics
        if x["name"] == "ApproxCountDistinct"
    }

    # ``column_types`` holds DataType *instances*, so the float check has to
    # be an isinstance test on the column being iterated. Comparing against
    # the classes themselves never matches, which silently disabled the
    # NaN-aware branch.
    float_columns = {
        c
        for c in self.columns_to_profile
        if isinstance(column_types[c], (DoubleType, FloatType))
    }

    select_numeric_null_counts = [
        count(
            when(
                isnan(c) | col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if c in float_columns
    ]

    # PySpark doesn't support isnan() on non-float/double columns
    select_nonnumeric_null_counts = [
        count(
            when(
                col(c).isNull(),
                c,
            )
        ).alias(c)
        for c in self.columns_to_profile
        if c not in float_columns
    ]

    null_counts = dataframe.select(
        select_numeric_null_counts + select_nonnumeric_null_counts
    )
    column_null_counts = null_counts.toPandas().T[0].to_dict()
    # An empty dataframe still has columns; avoid dividing by a zero row count.
    column_null_fractions = {
        c: (column_null_counts[c] / self.row_count if self.row_count > 0 else 0.0)
        for c in self.columns_to_profile
    }
    column_nonnull_counts = {
        c: self.row_count - column_null_counts[c] for c in self.columns_to_profile
    }

    column_unique_proportions = {
        c: (
            column_distinct_counts[c] / column_nonnull_counts[c]
            if column_nonnull_counts[c] > 0
            else 0
        )
        for c in self.columns_to_profile
    }

    if self.profiling_config.include_field_sample_values:
        # take sample and convert to Pandas DataFrame
        if self.row_count < NUM_SAMPLE_ROWS:
            # if row count is less than number to sample, just take all rows
            rdd_sample = dataframe.rdd.take(self.row_count)
        else:
            rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0)

    # init column specs with profiles
    for column in self.columns_to_profile:
        column_profile = DatasetFieldProfileClass(fieldPath=column)
        column_spec = _SingleColumnSpec(column, column_profile)

        column_profile.uniqueCount = column_distinct_counts.get(column)
        column_profile.uniqueProportion = column_unique_proportions.get(column)
        column_profile.nullCount = column_null_counts.get(column)
        column_profile.nullProportion = column_null_fractions.get(column)
        if self.profiling_config.include_field_sample_values:
            column_profile.sampleValues = [str(x[column]) for x in rdd_sample]

        column_spec.type_ = column_types[column]
        # NOTE(review): the second argument passed here is the null fraction;
        # confirm `_convert_to_cardinality` does not expect the unique
        # proportion instead.
        column_spec.cardinality = _convert_to_cardinality(
            column_distinct_counts[column],
            column_null_fractions[column],
        )

        self.column_specs.append(column_spec)
def _handle_convert_column_evrs(  # noqa: C901 (complexity)
    self,
    profile: DatasetProfileClass,
    column: str,
    col_evrs: Iterable[ExpectationValidationResult],
    pretty_name: str,
    send_sample_values: bool,
) -> None:
    """Convert one column's expectation results into a field profile.

    A fresh ``DatasetFieldProfileClass`` for ``column`` is appended to
    ``profile.fieldProfiles`` (created if unset) and then filled in from
    each result in ``col_evrs`` according to its expectation type. Results
    without a payload and unknown expectation types are reported through
    ``self.report.report_warning``.

    Args:
        profile: Dataset profile to extend; mutated in place.
        column: Name of the column the results belong to.
        col_evrs: Expectation validation results for that column.
        pretty_name: Human-readable dataset name used in warning titles.
        send_sample_values: When false, sample values and distinct value
            frequencies are stored as empty lists.
    """
    # TRICKY: This method mutates the profile directly.
    field_profile = DatasetFieldProfileClass(fieldPath=column)
    profile.fieldProfiles = profile.fieldProfiles or []
    profile.fieldProfiles.append(field_profile)

    # Expectations whose observed value is stored unchanged.
    copied_as_is = {
        "expect_column_unique_value_count_to_be_between": "uniqueCount",
        "expect_column_proportion_of_unique_values_to_be_between": "uniqueProportion",
    }
    # Expectations whose observed value is stored as a string.
    copied_as_str = {
        "expect_column_mean_to_be_between": "mean",
        "expect_column_min_to_be_between": "min",
        "expect_column_max_to_be_between": "max",
        "expect_column_median_to_be_between": "median",
        "expect_column_stdev_to_be_between": "stdev",
    }
    # Expectations that are deliberately ignored:
    #  - not_match_regex: generally used for whitespace checks
    #  - be_in_type_list: column types are already known via ingestion
    #  - be_unique: generally covered by the unique value count test
    skipped = (
        "expect_column_values_to_not_match_regex",
        "expect_column_values_to_be_in_type_list",
        "expect_column_values_to_be_unique",
    )

    for evr in col_evrs:
        expectation: str = evr.expectation_config.expectation_type
        result: dict = evr.result
        if not result:
            self.report.report_warning(
                f"profile of {pretty_name}",
                f"{expectation} did not yield any results",
            )
            continue

        if expectation in copied_as_is:
            setattr(field_profile, copied_as_is[expectation], result["observed_value"])
            continue

        if expectation in copied_as_str:
            setattr(
                field_profile,
                copied_as_str[expectation],
                str(result["observed_value"]),
            )
            continue

        if expectation in skipped:
            continue

        if expectation == "expect_column_values_to_not_be_null":
            field_profile.nullCount = result["unexpected_count"]
            unexpected_percent = result.get("unexpected_percent")
            if unexpected_percent is not None:
                field_profile.nullProportion = unexpected_percent / 100
            continue

        if expectation == "expect_column_quantile_values_to_be_between":
            if "observed_value" in result:
                observed = result["observed_value"]
                field_profile.quantiles = [
                    QuantileClass(quantile=str(quantile), value=str(value))
                    for quantile, value in zip(
                        observed["quantiles"], observed["values"]
                    )
                ]
            continue

        if expectation == "expect_column_values_to_be_in_set":
            samples = [str(v) for v in result["partial_unexpected_list"]]
            field_profile.sampleValues = samples if send_sample_values else []
            continue

        if expectation == "expect_column_kl_divergence_to_be_less_than":
            if "details" in result and "observed_partition" in result["details"]:
                partition = result["details"]["observed_partition"]
                boundaries = [str(v) for v in partition["bins"]]
                tails = partition["tail_weights"]
                # Tail weights bracket the regular bin weights.
                heights = [tails[0], *partition["weights"], tails[1]]
                field_profile.histogram = HistogramClass(boundaries, heights)
            continue

        if expectation == "expect_column_distinct_values_to_be_in_set":
            if "details" in result and "value_counts" in result["details"]:
                # This can be used to produce a bar chart since it includes
                # values and frequencies. As such, it is handled differently
                # from expect_column_values_to_be_in_set, which is
                # nonexhaustive.
                frequencies = [
                    ValueFrequencyClass(value=str(value), frequency=count)
                    for value, count in result["details"]["value_counts"].items()
                ]
                field_profile.distinctValueFrequencies = (
                    frequencies if send_sample_values else []
                )
            continue

        self.report.report_warning(
            f"profile of {pretty_name}",
            f"warning: unknown column mapper {expectation} in col {column}",
        )