def Compliance(self, instance, predicate, where=None):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(Compliance(instance, predicate, where))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    # the JSON and DataFrame views of the metrics must agree
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
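# A hypothetical usage sketch for the helper above, not part of the original
# suite: it assumes the test fixture self.df has a numeric column "b", and the
# predicate string is made up. Compliance reports the fraction of rows
# satisfying the predicate, so the value must lie in [0, 1].
def test_Compliance_sketch(self):
    values = self.Compliance("rows with b >= 2", "b >= 2")
    self.assertTrue(0.0 <= values[0]["value"] <= 1.0)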
def UniqueValueRatio(self, columns, where=None):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(UniqueValueRatio(columns, where))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
def ApproxQuantiles(self, column, quantiles):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(ApproxQuantiles(column, quantiles))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
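# A hypothetical usage sketch for the helper above: ApproxQuantiles emits one
# success metric per requested quantile, so the returned list should have one
# Row per entry in `quantiles`. Column "b" and the quantile list are
# assumptions about the test fixture.
def test_ApproxQuantiles_sketch(self):
    values = self.ApproxQuantiles("b", [0.25, 0.5, 0.75])
    self.assertEqual(len(values), 3)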
def test_KLLSketch(self):
    # KLLParameters: (spark_session, sketch_size, shrinking_factor, number_of_buckets)
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2)))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df.show()
    return result_df.select("value").collect()
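# A hedged variant of the test above with roomier sketch parameters. A sketch
# as small as (2, 0.64, 2) keeps the test cheap but is much coarser than what
# would be used on real data; the values below are illustrative assumptions,
# not library defaults.
def test_KLLSketch_larger_sketch(self):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(KLLSketch("b", KLLParameters(self.spark, 2048, 0.64, 10)))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    # a single KLL sketch metric is reported for column "b"
    self.assertEqual(result_df.count(), 1)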
def test_PatternMatch(self):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)"))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    # no value in column "a" fully matches the pattern, so the match ratio is 0.0
    self.assertEqual(result_df.select("value").collect(), [Row(value=0.0)])
def Histogram_maxBins(self, column, binningUdf=None, maxDetailBins: int = None, where: str = None):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(Histogram(column, binningUdf, maxDetailBins, where))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
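# A hypothetical usage sketch for the helper above: maxDetailBins caps how many
# distinct-value buckets the Histogram analyzer reports in detail. Column "a"
# and the cap of 2 are assumptions about the test fixture.
def test_Histogram_maxBins_sketch(self):
    values = self.Histogram_maxBins("a", maxDetailBins=2)
    # at least the overall bin-count metric should come back
    self.assertTrue(len(values) >= 1)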
def ingest_table(self, full_path: str, relative_path: str, is_aws: bool) -> Iterable[MetadataWorkUnit]:
    table_name = self.get_table_name(relative_path, full_path)

    # yield the table schema first
    logger.debug(
        f"Ingesting {full_path}: making table schemas {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
    )
    yield from self.get_table_schema(full_path, table_name, is_aws)

    # if profiling is not enabled, skip the rest
    if not self.source_config.profiling.enabled:
        return

    # read in the whole table with Spark for profiling
    table = self.read_file_spark(full_path, is_aws)

    # if the table is not readable, skip it
    if table is None:
        self.report.report_warning(
            table_name,
            f"unable to read table {table_name} from file {full_path}",
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            full_path,
        )

        logger.debug(
            f"Profiling {full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()
        logger.info(f"Finished profiling {full_path}; took {time_taken:.3f} seconds")
        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=make_dataset_urn(self.source_config.platform, table_name, self.source_config.env),
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(id=f"profile-{self.source_config.platform}-{full_path}", mcp=mcp)
    self.report.report_workunit(wu)
    yield wu
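# A hedged, standalone sketch of the batching described in the comments above,
# not the profiler's actual code: chaining addAnalyzer() calls and issuing a
# single run() lets deequ share table scans across metrics instead of launching
# one Spark job per metric. The analyzers and AnalyzerContext are standard
# pydeequ; `spark` and `df` are assumed to exist.
def profile_in_one_pass(spark, df):
    from pydeequ.analyzers import (
        AnalysisRunner,
        AnalyzerContext,
        ApproxCountDistinct,
        Completeness,
        Mean,
    )

    result = (
        AnalysisRunner(spark)
        .onData(df)
        .addAnalyzer(Completeness("a"))         # fraction of non-null values in "a"
        .addAnalyzer(ApproxCountDistinct("a"))  # approximate distinct count of "a"
        .addAnalyzer(Mean("b"))                 # mean of numeric column "b"
        .run()  # one pass computes all three metrics
    )
    return AnalyzerContext.successMetricsAsDataFrame(spark, result)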
def get_table_profile(self, table_data: TableData, dataset_urn: str) -> Iterable[MetadataWorkUnit]:
    # read in the whole table with Spark for profiling
    table = None
    try:
        table = self.read_file_spark(
            table_data.table_path, os.path.splitext(table_data.full_path)[1]
        )
    except Exception as e:
        logger.error(e)

    # if the table is not readable, skip it
    if table is None:
        self.report.report_warning(
            table_data.display_name,
            f"unable to read table {table_data.display_name} from file {table_data.full_path}",
        )
        return

    with PerfTimer() as timer:
        # init PySpark analysis object
        logger.debug(
            f"Profiling {table_data.full_path}: reading file and computing nulls+uniqueness {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler = _SingleTableProfiler(
            table,
            self.spark,
            self.source_config.profiling,
            self.report,
            table_data.full_path,
        )

        logger.debug(
            f"Profiling {table_data.full_path}: preparing profilers to run {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        # instead of computing each profile individually, we run them all in a single analyzer.run() call
        # we use a single call because the analyzer optimizes the number of calls to the underlying profiler
        # since multiple profiles reuse computations, this saves a lot of time
        table_profiler.prepare_table_profiles()

        # compute the profiles
        logger.debug(
            f"Profiling {table_data.full_path}: computing profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        analysis_result = table_profiler.analyzer.run()
        analysis_metrics = AnalyzerContext.successMetricsAsDataFrame(
            self.spark, analysis_result
        )

        logger.debug(
            f"Profiling {table_data.full_path}: extracting profiles {datetime.now().strftime('%d/%m/%Y %H:%M:%S')}"
        )
        table_profiler.extract_table_profiles(analysis_metrics)

        time_taken = timer.elapsed_seconds()
        logger.info(
            f"Finished profiling {table_data.full_path}; took {time_taken:.3f} seconds"
        )
        self.profiling_times_taken.append(time_taken)

    mcp = MetadataChangeProposalWrapper(
        entityType="dataset",
        entityUrn=dataset_urn,
        changeType=ChangeTypeClass.UPSERT,
        aspectName="datasetProfile",
        aspect=table_profiler.profile,
    )
    wu = MetadataWorkUnit(
        id=f"profile-{self.source_config.platform}-{table_data.table_path}", mcp=mcp
    )
    self.report.report_workunit(wu)
    yield wu
def test_Size(self):
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    result_df_row = result_df.select("value").collect()
    # the test fixture has exactly three rows
    self.assertEqual(result_df_row, [Row(value=3.0)])
def Correlation(self, column1, column2, where=None):
    result = (
        self.AnalysisRunner.onData(self.df)
        .addAnalyzer(Correlation(column1, column2, where))
        .run()
    )
    result_df = AnalyzerContext.successMetricsAsDataFrame(self.spark, result)
    # mirror the other helpers: the JSON result was previously computed and
    # discarded, so check it against the DataFrame view as well
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(df_from_json.select("value").collect(), result_df.select("value").collect())
    return result_df.select("value").collect()
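# A hypothetical usage sketch for the helper above: the Pearson correlation of
# a column with itself is 1.0, which makes a cheap sanity check. It assumes
# column "b" in the fixture is numeric and non-constant.
def test_Correlation_sketch(self):
    values = self.Correlation("b", "b")
    self.assertEqual(values, [Row(value=1.0)])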