def test_Size(self):
     """Size() should report the fixture DataFrame's row count (3 rows)."""
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(Size()).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, result)
     result_df_row = result_df.select("value").collect()
     self.assertEqual(result_df_row, [Row(value=3.0)])
 def test_KLLSketch(self):
     """Run the KLLSketch analyzer on column "b" and return the metric values.

     NOTE(review): this test carries no assertion — it only verifies the
     analyzer runs without raising. TODO: assert on the sketch metric value.
     """
     result = (self.AnalysisRunner.onData(self.df).addAnalyzer(
         KLLSketch("b", KLLParameters(self.spark, 2, 0.64, 2))).run())
     result_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, result)
     return result_df.select("value").collect()
 def Correlation(self, column1, column2, where=None):
     """Run the Correlation analyzer on *column1*/*column2*; return metric values.

     Like the sibling helpers, also verifies the JSON metric serialization
     round-trips to the same values as the DataFrame form (the original
     computed the JSON but discarded it).
     """
     result = self.AnalysisRunner.onData(self.df).addAnalyzer(
         Correlation(column1, column2, where)).run()
     result_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, result)
     result_json = AnalyzerContext.successMetricsAsJson(self.spark, result)
     df_from_json = self.spark.read.json(self.sc.parallelize([result_json]))
     self.assertEqual(
         df_from_json.select("value").collect(),
         result_df.select("value").collect())
     return result_df.select("value").collect()
 def Completeness(self, column, where=None):
     """Run the Completeness analyzer on *column* and return the metric values.

     Also checks the JSON serialization of the metrics agrees with the
     DataFrame form.
     """
     outcome = self.AnalysisRunner.onData(self.df).addAnalyzer(
         Completeness(column, where)).run()
     metrics_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, outcome)
     metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, outcome)
     json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
     collected = metrics_df.select("value").collect()
     # JSON round-trip must yield the same metric values.
     self.assertEqual(json_df.select("value").collect(), collected)
     return collected
 def test_PatternMatch(self):
     """PatternMatch on column "a" with regex "ba(r|z)" should match no rows."""
     outcome = (self.AnalysisRunner.onData(self.df)
                .addAnalyzer(PatternMatch(column="a", pattern_regex="ba(r|z)"))
                .run())
     metrics_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, outcome)
     metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, outcome)
     json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
     collected = metrics_df.select("value").collect()
     # JSON and DataFrame forms must agree; no value in "a" matches.
     self.assertEqual(json_df.select("value").collect(), collected)
     self.assertEqual(collected, [Row(value=0.0)])
 def ApproxQuantile(self, column, quantile, where=None):
     """Run ApproxQuantile (fixed relative error 0.01); return metric values.

     Also checks the JSON serialization of the metrics agrees with the
     DataFrame form.
     """
     rel_error: float = 0.01
     analyzer = ApproxQuantile(column, quantile, rel_error, where)
     outcome = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()
     metrics_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, outcome)
     metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, outcome)
     json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
     collected = metrics_df.select("value").collect()
     # JSON round-trip must yield the same metric values.
     self.assertEqual(json_df.select("value").collect(), collected)
     return collected
 def Histogram_maxBins(self,
                       column,
                       binningUdf=None,
                       maxDetailBins: int = None,
                       where: str = None):
     """Run the Histogram analyzer on *column* and return the metric values.

     Also checks the JSON serialization of the metrics agrees with the
     DataFrame form.
     """
     analyzer = Histogram(column, binningUdf, maxDetailBins, where)
     outcome = self.AnalysisRunner.onData(self.df).addAnalyzer(analyzer).run()
     metrics_df = AnalyzerContext.successMetricsAsDataFrame(
         self.spark, outcome)
     metrics_json = AnalyzerContext.successMetricsAsJson(self.spark, outcome)
     json_df = self.spark.read.json(self.sc.parallelize([metrics_json]))
     collected = metrics_df.select("value").collect()
     # JSON round-trip must yield the same metric values.
     self.assertEqual(json_df.select("value").collect(), collected)
     return collected