def Correlation(self, column1, column2, where=None):
    """Run the Correlation analyzer on ``self.df`` and return the metric values.

    Args:
        column1: First column of the correlation pair.
        column2: Second column of the correlation pair.
        where: Optional filter predicate passed through to the analyzer.

    Returns:
        The collected ``Row`` objects from the ``value`` column of the
        success-metrics DataFrame.
    """
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(
        Correlation(column1, column2, where)).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    # Previously the JSON metrics were computed but discarded. Assert they
    # agree with the DataFrame metrics, matching the sibling helpers
    # (Completeness, ApproxQuantile, Histogram_maxBins).
    df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
    self.assertEqual(
        df_from_json.select("value").collect(),
        result_df.select("value").collect())
    return result_df.select("value").collect()
def Completeness(self, column, where=None):
    """Run the Completeness analyzer on ``self.df``.

    Cross-checks the JSON-serialized metrics against the metrics
    DataFrame, then returns the collected ``value`` rows.
    """
    runner = self.AnalysisRunner.onData(self.df)
    result = runner.addAnalyzer(Completeness(column, where)).run()
    metrics_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    # Round-trip the JSON metrics through Spark and compare with the
    # DataFrame representation.
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    observed = metrics_df.select("value").collect()
    self.assertEqual(json_df.select("value").collect(), observed)
    return observed
def test_PatternMatch(self):
    """PatternMatch on column ``a`` with regex ``ba(r|z)`` yields 0.0.

    Also verifies that the JSON and DataFrame metric representations
    agree with each other.
    """
    analyzer = PatternMatch(column="a", pattern_regex="ba(r|z)")
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()
    metrics_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    observed = metrics_df.select("value").collect()
    self.assertEqual(json_df.select("value").collect(), observed)
    # No value in column "a" matches the pattern on the fixture data.
    self.assertEqual(observed, [Row(value=0.0)])
def ApproxQuantile(self, column, quantile, where=None):
    """Run the ApproxQuantile analyzer on ``self.df``.

    Uses a fixed relative error of 0.01, cross-checks the JSON metrics
    against the metrics DataFrame, and returns the ``value`` rows.
    """
    relativeError: float = 0.01  # fixed tolerance used by this suite
    analyzer = ApproxQuantile(column, quantile, relativeError, where)
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()
    metrics_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    observed = metrics_df.select("value").collect()
    self.assertEqual(json_df.select("value").collect(), observed)
    return observed
def Histogram_maxBins(self, column, binningUdf=None,
                      maxDetailBins: int = None, where: str = None):
    """Run the Histogram analyzer on ``self.df`` with an optional bin cap.

    Cross-checks the JSON-serialized metrics against the metrics
    DataFrame, then returns the collected ``value`` rows.
    """
    analyzer = Histogram(column, binningUdf, maxDetailBins, where)
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()
    metrics_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
    json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
    observed = metrics_df.select("value").collect()
    self.assertEqual(json_df.select("value").collect(), observed)
    return observed
def test_Size(self):
    """The Size analyzer reports 3.0 rows for the fixture DataFrame."""
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
    result_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    # Removed commented-out dead code that duplicated this collect.
    self.assertEqual(result_df.select("value").collect(), [Row(value=3.0)])
def test_KLLSketch(self):
    """Run the KLLSketch analyzer on column ``b`` and return its values.

    Displays the metrics DataFrame on the console (as the original did)
    and returns the collected ``value`` rows.
    """
    sketch = KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2))
    result = self.AnalysisRunner.onData(self.df).addAnalyzer(sketch).run()
    metrics_df = AnalyzerContext.successMetricsAsDataFrame(
        self.spark, result)
    # Console output preserved to keep observable behavior identical.
    metrics_df.show()
    return metrics_df.select("value").collect()