class _SingleTableProfiler:
    spark: SparkSession
    dataframe: DataFrame
    analyzer: AnalysisRunBuilder
    column_specs: List[_SingleColumnSpec]
    row_count: int
    profiling_config: DataLakeProfilerConfig
    file_path: str
    columns_to_profile: List[str]
    ignored_columns: List[str]
    profile: DatasetProfileClass
    report: DataLakeSourceReport

    def __init__(
        self,
        dataframe: DataFrame,
        spark: SparkSession,
        profiling_config: DataLakeProfilerConfig,
        report: DataLakeSourceReport,
        file_path: str,
    ):
        self.spark = spark
        self.dataframe = dataframe
        self.analyzer = AnalysisRunner(spark).onData(dataframe)
        self.column_specs = []
        self.row_count = dataframe.count()
        self.profiling_config = profiling_config
        self.file_path = file_path
        self.columns_to_profile = []
        self.ignored_columns = []
        self.profile = DatasetProfileClass(timestampMillis=get_sys_time())
        self.report = report

        self.profile.rowCount = self.row_count
        self.profile.columnCount = len(dataframe.columns)

        column_types = {x.name: x.dataType for x in dataframe.schema.fields}

        if self.profiling_config.profile_table_level_only:
            return

        # get column distinct counts
        for column in dataframe.columns:
            if not self.profiling_config.allow_deny_patterns.allowed(column):
                self.ignored_columns.append(column)
                continue

            self.columns_to_profile.append(column)
            # Normal CountDistinct is ridiculously slow
            self.analyzer.addAnalyzer(ApproxCountDistinct(column))

        if self.profiling_config.max_number_of_fields_to_profile is not None:
            if (
                len(self.columns_to_profile)
                > self.profiling_config.max_number_of_fields_to_profile
            ):
                columns_being_dropped = self.columns_to_profile[
                    self.profiling_config.max_number_of_fields_to_profile :
                ]
                self.columns_to_profile = self.columns_to_profile[
                    : self.profiling_config.max_number_of_fields_to_profile
                ]

                self.report.report_file_dropped(
                    f"The max_number_of_fields_to_profile={self.profiling_config.max_number_of_fields_to_profile} limit was reached. Not profiling columns of {self.file_path}: ({', '.join(sorted(columns_being_dropped))})"
                )
        analysis_result = self.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsJson(
            self.spark, analysis_result
        )

        # reshape distinct counts into dictionary
        column_distinct_counts = {
            x["instance"]: int(x["value"])
            for x in analysis_metrics
            if x["name"] == "ApproxCountDistinct"
        }

        select_numeric_null_counts = [
            count(
                when(
                    isnan(c) | col(c).isNull(),
                    c,
                )
            ).alias(c)
            for c in self.columns_to_profile
            if isinstance(column_types[c], (DoubleType, FloatType))
        ]

        # PySpark doesn't support isnan() on non-float/double columns
        select_nonnumeric_null_counts = [
            count(
                when(
                    col(c).isNull(),
                    c,
                )
            ).alias(c)
            for c in self.columns_to_profile
            if not isinstance(column_types[c], (DoubleType, FloatType))
        ]

        null_counts = dataframe.select(
            select_numeric_null_counts + select_nonnumeric_null_counts
        )
        column_null_counts = null_counts.toPandas().T[0].to_dict()
        column_null_fractions = {
            c: column_null_counts[c] / self.row_count
            for c in self.columns_to_profile
        }
        column_nonnull_counts = {
            c: self.row_count - column_null_counts[c]
            for c in self.columns_to_profile
        }
        column_unique_proportions = {
            c: (
                column_distinct_counts[c] / column_nonnull_counts[c]
                if column_nonnull_counts[c] > 0
                else 0
            )
            for c in self.columns_to_profile
        }

        if self.profiling_config.include_field_sample_values:
            # take sample and convert to Pandas DataFrame
            if self.row_count < NUM_SAMPLE_ROWS:
                # if row count is less than number to sample, just take all rows
                rdd_sample = dataframe.rdd.take(self.row_count)
            else:
                rdd_sample = dataframe.rdd.takeSample(False, NUM_SAMPLE_ROWS, seed=0)

        # init column specs with profiles
        for column in self.columns_to_profile:
            column_profile = DatasetFieldProfileClass(fieldPath=column)
            column_spec = _SingleColumnSpec(column, column_profile)

            column_profile.uniqueCount = column_distinct_counts.get(column)
            column_profile.uniqueProportion = column_unique_proportions.get(column)
            column_profile.nullCount = column_null_counts.get(column)
            column_profile.nullProportion = column_null_fractions.get(column)
            if self.profiling_config.include_field_sample_values:
                column_profile.sampleValues = [str(x[column]) for x in rdd_sample]

            column_spec.type_ = column_types[column]
            column_spec.cardinality = _convert_to_cardinality(
                column_distinct_counts[column],
                column_null_fractions[column],
            )

            self.column_specs.append(column_spec)

    def prep_min_value(self, column: str) -> None:
        if self.profiling_config.include_field_min_value:
            self.analyzer.addAnalyzer(Minimum(column))

    def prep_max_value(self, column: str) -> None:
        if self.profiling_config.include_field_max_value:
            self.analyzer.addAnalyzer(Maximum(column))

    def prep_mean_value(self, column: str) -> None:
        if self.profiling_config.include_field_mean_value:
            self.analyzer.addAnalyzer(Mean(column))

    def prep_median_value(self, column: str) -> None:
        if self.profiling_config.include_field_median_value:
            self.analyzer.addAnalyzer(ApproxQuantile(column, 0.5))

    def prep_stdev_value(self, column: str) -> None:
        if self.profiling_config.include_field_stddev_value:
            self.analyzer.addAnalyzer(StandardDeviation(column))

    def prep_quantiles(self, column: str) -> None:
        if self.profiling_config.include_field_quantiles:
            self.analyzer.addAnalyzer(ApproxQuantiles(column, QUANTILES))

    def prep_distinct_value_frequencies(self, column: str) -> None:
        if self.profiling_config.include_field_distinct_value_frequencies:
            self.analyzer.addAnalyzer(Histogram(column))

    def prep_field_histogram(self, column: str) -> None:
        if self.profiling_config.include_field_histogram:
            self.analyzer.addAnalyzer(
                Histogram(column, maxDetailBins=MAX_HIST_BINS)
            )
    def prepare_table_profiles(self) -> None:
        row_count = self.row_count

        telemetry.telemetry_instance.ping(
            "profile_data_lake_table",
            # bucket by taking floor of log of the number of rows scanned
            {"rows_profiled": 10 ** int(log10(row_count + 1))},
        )

        # loop through the columns and add the analyzers
        for column_spec in self.column_specs:
            column = column_spec.column
            column_profile = column_spec.column_profile
            type_ = column_spec.type_
            cardinality = column_spec.cardinality

            non_null_count = column_spec.non_null_count
            unique_count = column_spec.unique_count

            if (
                self.profiling_config.include_field_null_count
                and non_null_count is not None
            ):
                null_count = row_count - non_null_count
                assert null_count >= 0
                column_profile.nullCount = null_count
                if row_count > 0:
                    column_profile.nullProportion = null_count / row_count

            if unique_count is not None:
                column_profile.uniqueCount = unique_count
                if non_null_count is not None and non_null_count > 0:
                    column_profile.uniqueProportion = unique_count / non_null_count

            if isinstance(
                type_,
                (
                    DecimalType,
                    DoubleType,
                    FloatType,
                    IntegerType,
                    LongType,
                    ShortType,
                ),
            ):
                if cardinality == Cardinality.UNIQUE:
                    pass
                elif cardinality in [
                    Cardinality.ONE,
                    Cardinality.TWO,
                    Cardinality.VERY_FEW,
                    Cardinality.FEW,
                ]:
                    column_spec.histogram_distinct = True
                    self.prep_distinct_value_frequencies(column)
                elif cardinality in [
                    Cardinality.MANY,
                    Cardinality.VERY_MANY,
                    Cardinality.UNIQUE,
                ]:
                    column_spec.histogram_distinct = False
                    self.prep_min_value(column)
                    self.prep_max_value(column)
                    self.prep_mean_value(column)
                    self.prep_median_value(column)
                    self.prep_stdev_value(column)
                    self.prep_quantiles(column)
                    self.prep_field_histogram(column)
                else:
                    # unknown cardinality - skip
                    pass

            elif isinstance(type_, StringType):
                if cardinality in [
                    Cardinality.ONE,
                    Cardinality.TWO,
                    Cardinality.VERY_FEW,
                    Cardinality.FEW,
                ]:
                    column_spec.histogram_distinct = True
                    self.prep_distinct_value_frequencies(column)

            elif isinstance(type_, (DateType, TimestampType)):
                self.prep_min_value(column)
                self.prep_max_value(column)

                # FIXME: Re-add histogram once kl_divergence has been modified to support datetimes
                if cardinality in [
                    Cardinality.ONE,
                    Cardinality.TWO,
                    Cardinality.VERY_FEW,
                    Cardinality.FEW,
                ]:
                    self.prep_distinct_value_frequencies(column)

    def extract_table_profiles(
        self,
        analysis_metrics: DataFrame,
    ) -> None:
        self.profile.fieldProfiles = []

        analysis_metrics = analysis_metrics.toPandas()
        # DataFrame with following columns:
        #   entity: "Column" for column profile, "Table" for table profile
        #   instance: name of column being profiled. "*" for table profiles
        #   name: name of metric. Histogram metrics are formatted as "Histogram.<metric>.<value>"
        #   value: value of metric

        column_metrics = analysis_metrics[analysis_metrics["entity"] == "Column"]

        # resolve histogram types for grouping
        column_metrics["kind"] = column_metrics["name"].apply(
            lambda x: "Histogram" if x.startswith("Histogram.") else x
        )

        column_histogram_metrics = column_metrics[
            column_metrics["kind"] == "Histogram"
        ]
        column_nonhistogram_metrics = column_metrics[
            column_metrics["kind"] != "Histogram"
        ]

        histogram_columns = set()

        if len(column_histogram_metrics) > 0:
            # we only want the absolute counts for each histogram for now
            column_histogram_metrics = column_histogram_metrics[
                column_histogram_metrics["name"].apply(
                    lambda x: x.startswith("Histogram.abs.")
                )
            ]

            # get the histogram bins by chopping off the "Histogram.abs." prefix
            column_histogram_metrics["bin"] = column_histogram_metrics["name"].apply(
                lambda x: x[14:]
            )

            # reshape histogram counts for easier access
            histogram_counts = column_histogram_metrics.set_index(
                ["instance", "bin"]
            )["value"]

            histogram_columns = set(histogram_counts.index.get_level_values(0))

        profiled_columns = set()

        if len(column_nonhistogram_metrics) > 0:
            # reshape other metrics for easier access
            nonhistogram_metrics = column_nonhistogram_metrics.set_index(
                ["instance", "name"]
            )["value"]

            profiled_columns = set(nonhistogram_metrics.index.get_level_values(0))
            # histogram_columns = set(histogram_counts.index.get_level_values(0))

        for column_spec in self.column_specs:
            column = column_spec.column
            column_profile = column_spec.column_profile

            if column not in profiled_columns:
                continue

            # convert to Dict so we can use .get
            deequ_column_profile = nonhistogram_metrics.loc[column].to_dict()

            # uniqueCount, uniqueProportion, nullCount, nullProportion, sampleValues already set in TableWrapper
            column_profile.min = null_str(deequ_column_profile.get("Minimum"))
            column_profile.max = null_str(deequ_column_profile.get("Maximum"))
            column_profile.mean = null_str(deequ_column_profile.get("Mean"))
            column_profile.median = null_str(
                deequ_column_profile.get("ApproxQuantiles-0.5")
            )
            column_profile.stdev = null_str(
                deequ_column_profile.get("StandardDeviation")
            )
            if all(
                deequ_column_profile.get(f"ApproxQuantiles-{quantile}") is not None
                for quantile in QUANTILES
            ):
                column_profile.quantiles = [
                    QuantileClass(
                        quantile=str(quantile),
                        value=str(deequ_column_profile[f"ApproxQuantiles-{quantile}"]),
                    )
                    for quantile in QUANTILES
                ]

            if column in histogram_columns:
                column_histogram = histogram_counts.loc[column]
                # sort so output is deterministic
                column_histogram = column_histogram.sort_index()

                if column_spec.histogram_distinct:
                    column_profile.distinctValueFrequencies = [
                        ValueFrequencyClass(
                            value=value,
                            frequency=int(column_histogram.loc[value]),
                        )
                        for value in column_histogram.index
                    ]
                    # sort so output is deterministic
                    column_profile.distinctValueFrequencies = sorted(
                        column_profile.distinctValueFrequencies,
                        key=lambda x: x.value,
                    )
                else:
                    column_profile.histogram = HistogramClass(
                        [str(x) for x in column_histogram.index],
                        [float(x) for x in column_histogram],
                    )

            # append the column profile to the dataset profile
            self.profile.fieldProfiles.append(column_profile)
    def createAnalysisRunner(self):
        # build a fresh Deequ analysis runner bound to this profiler's Spark session
        return AnalysisRunner(self.spark)
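
# Hedged usage sketch (illustrative, not part of the original module): one way a caller
# might drive _SingleTableProfiler end to end. The helper name
# _profile_single_table_example is hypothetical, and calling
# AnalyzerContext.successMetricsAsDataFrame is an assumption based on
# extract_table_profiles() expecting a Spark DataFrame of Deequ metrics.
def _profile_single_table_example(
    dataframe: DataFrame,
    spark: SparkSession,
    profiling_config: DataLakeProfilerConfig,
    report: DataLakeSourceReport,
    file_path: str,
) -> DatasetProfileClass:
    profiler = _SingleTableProfiler(
        dataframe=dataframe,
        spark=spark,
        profiling_config=profiling_config,
        report=report,
        file_path=file_path,
    )
    # register the per-column analyzers chosen by type and cardinality
    profiler.prepare_table_profiles()
    # run the accumulated Deequ analyzers and collect the metrics as a DataFrame
    analysis_result = profiler.analyzer.run()
    analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(spark, analysis_result)
    # populate profile.fieldProfiles from the metric rows
    profiler.extract_table_profiles(analysis_metrics)
    return profiler.profile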