from pyspark.sql import SparkSession

# Project-internal imports (analyzers, AnalysisRunner, VerificationSuite,
# Check, ResultKey, FileSystemMetricsRepository) come from this package;
# their exact module paths are assumed from the surrounding code.


def main():
    # SparkSession startup, pulling in the Deequ jar
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
             .appName('suggestions-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)  # test_data is defined elsewhere

    # Analysis run: compute the Size metric over df
    a = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .run())

    # Persist the metrics in a file-system repository under a tagged key
    key = ResultKey(spark, 100000, {'key1': 'value1'})
    myrepo = FileSystemMetricsRepository(spark, '../test.json')
    myrepo.save(key, a)

    # Verification run: check row count and uniqueness, appending the
    # resulting metrics to the same repository under a second key
    key2 = ResultKey(spark, 100000, {'key1': 'value2', 'key2': 'value3'})
    v = (VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2'))
         .useRepository(myrepo)
         .saveOrAppendResult(key2)
         .run())

    # Query the repository: metrics tagged key1=value1 and newer than 99000
    myrepo.load() \
        .withTagValues({'key1': 'value1'}) \
        .after(99000) \
        .getMetricsAsDF() \
        .show()

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
from pyspark.sql import DataFrame, SparkSession

# Project-internal imports (analyzers, AnalysisRunner) are assumed from
# this package, as in the previous example.


def main():
    # SparkSession startup, pulling in the Deequ jar
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages', 'com.amazon.deequ:deequ:1.0.5')
             .appName('profiler-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)  # test_data is defined elsewhere

    # Run several analyzers in a single pass over the data
    r = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .addAnalyzer(analyzers.Completeness('_3'))
         .addAnalyzer(analyzers.ApproxCountDistinct('_1'))
         .addAnalyzer(analyzers.Mean('_2'))
         .addAnalyzer(analyzers.Compliance('top values', '_2 > 15'))
         .addAnalyzer(analyzers.Correlation('_2', '_5'))
         .run())

    # Wrap the Java result in a pyspark DataFrame and print every row
    metrics = DataFrame(r, spark)
    metrics.show(metrics.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
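Both examples above reference a test_data sequence that is defined elsewhere. Its exact contents are not shown, but the analyzers and checks constrain its shape: eight rows (hasSize(lambda x: x == 8)), at least five columns named _1 through _5 (the default names createDataFrame assigns to plain tuples), a unique numeric _2, and a numeric _5. A purely illustrative stand-in satisfying those constraints:

# Hypothetical stand-in for the project's test_data: eight rows, five
# columns (_1.._5), _2 unique and partly > 15, _3 with a missing value.
# The actual fixture in the project may differ.
test_data = [
    ('a', 10, 'x', 1, 1.0),
    ('b', 12, 'y', 2, 2.0),
    ('c', 14, 'z', 3, 3.0),
    ('d', 16, None, 4, 4.0),
    ('e', 18, 'x', 5, 5.0),
    ('f', 20, 'y', 6, 6.0),
    ('g', 22, 'z', 7, 7.0),
    ('h', 24, 'x', 8, 8.0),
]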
def test_Size(self):
    # Size over the eight-row fixture, read back via the wrapper's
    # successMetricsAsDataFrame() convenience method
    out = self.runner.onData(self.df) \
        .addAnalyzer(analyzers.Size()) \
        .run() \
        .successMetricsAsDataFrame()
    out = out.select('value').collect()
    self.assertEqual(out, [Row(value=8)])
def test_Size(self):
    # Same check, but wrapping the raw Java result into a pyspark
    # DataFrame by hand instead of using successMetricsAsDataFrame()
    out = self.runner.onData(self.df) \
        .addAnalyzer(analyzers.Size()) \
        .run()
    out = DataFrame(out, self.spark).select('value').collect()
    self.assertEqual(out, [Row(value=8)])
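Both tests assume a unittest fixture that provides self.spark, self.runner, and self.df. A minimal sketch of such a fixture, assuming the same module layout as the examples above and the hypothetical test_data shown earlier; the project's real test class may be set up differently:

import unittest

from pyspark.sql import DataFrame, Row, SparkSession

import analyzers
from base import AnalysisRunner  # module path assumed from the examples above


class AnalyzerTests(unittest.TestCase):
    """Hypothetical fixture for the test_Size variants above."""

    def setUp(self):
        # Mirror the examples: local session with the Deequ jar on the classpath
        self.spark = (SparkSession
                      .builder
                      .master('local[*]')
                      .config('spark.jars.packages',
                              'com.amazon.deequ:deequ:1.0.5')
                      .appName('analyzer-tests')
                      .getOrCreate())
        self.df = self.spark.createDataFrame(test_data)  # eight-row fixture
        self.runner = AnalysisRunner(self.spark)

    def tearDown(self):
        # Same teardown order as the examples: gateway first, then the session
        self.spark.sparkContext._gateway.close()
        self.spark.stop()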