Example #1
def main():
    # SparkSession startup
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages',
                     'com.amazon.deequ:deequ:1.0.5')
             .appName('suggestions-example')
             .getOrCreate())
    # test_data: see the sketch after this example
    df = spark.createDataFrame(test_data)
    # Analysis run
    a = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .run())
    # Persist the computed metrics under a tagged result key
    key = ResultKey(spark, 100000, {'key1': 'value1'})
    myrepo = FileSystemMetricsRepository(spark, '../test.json')
    myrepo.save(key, a)

    # Verification run, saved to the same repository under a second key
    key2 = ResultKey(spark, 100000, {'key1': 'value2', 'key2': 'value3'})

    v = (base.VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2'))
         .useRepository(myrepo)
         .saveOrAppendResult(key2)
         .run())

    # Load metrics tagged key1=value1 and created after timestamp 99000
    myrepo.load().withTagValues({'key1': 'value1'}).after(99000) \
        .getMetricsAsDF().show()

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
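Every example here calls spark.createDataFrame(test_data) on a test_data object that is never defined in these snippets. A minimal sketch of what it could look like, assuming eight rows of five-element tuples so that Spark assigns the default column names _1 through _5 and the hasSize(lambda x: x == 8) check, isUnique('_2'), and the Row(value=8) assertions in the test examples all pass; the concrete values are purely illustrative:

# Hypothetical fixture, not part of the original code: 8 rows, columns _1.._5.
# _2 is unique and numeric (Mean, Compliance '_2 > 15', isUnique),
# _3 contains nulls (Completeness), _5 is numeric (Correlation with _2).
test_data = [
    ('foo', 16, 'low', True, 1.0),
    ('bar', 17, 'low', False, 2.0),
    ('baz', 18, None, True, 3.0),
    ('foo', 19, 'mid', False, 4.0),
    ('bar', 20, 'mid', True, 5.0),
    ('baz', 21, None, False, 6.0),
    ('foo', 22, 'high', True, 7.0),
    ('bar', 23, 'high', False, 8.0),
]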
Example #2
def main():
    # SparkSession startup
    spark = (SparkSession
             .builder
             .master('local[*]')
             .config('spark.jars.packages',
                     'com.amazon.deequ:deequ:1.0.5')
             .appName('profiler-example')
             .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Analysis run computing several metrics in a single pass
    r = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .addAnalyzer(analyzers.Completeness('_3'))
         .addAnalyzer(analyzers.ApproxCountDistinct('_1'))
         .addAnalyzer(analyzers.Mean('_2'))
         .addAnalyzer(analyzers.Compliance('top values', '_2 > 15'))
         .addAnalyzer(analyzers.Correlation('_2', '_5'))
         .run())

    # Wrap the run result in a PySpark DataFrame and show every metric row untruncated
    metrics_df = DataFrame(r, spark)
    metrics_df.show(metrics_df.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
Example #3
def test_Size(self):
    # Size() should count all eight rows of the test DataFrame
    out = self.runner.onData(self.df) \
        .addAnalyzer(analyzers.Size()) \
        .run().successMetricsAsDataFrame()
    out = out.select('value').collect()
    self.assertEqual(out, [Row(value=8)])
Example #4
def test_Size(self):
    # Same check as Example #3, but the raw run result is wrapped in a
    # PySpark DataFrame directly instead of calling successMetricsAsDataFrame()
    out = self.runner.onData(self.df) \
        .addAnalyzer(analyzers.Size()) \
        .run()
    out = DataFrame(out, self.spark).select('value').collect()
    self.assertEqual(out, [Row(value=8)])
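The two test methods above assume a unittest.TestCase that provides self.spark, self.df and self.runner. A possible scaffold they could sit in, sketched from Example #1's setup; the class name, the use of setUpClass/tearDownClass, and the wrapper import locations are assumptions, not part of the original snippets:

import unittest

from pyspark.sql import SparkSession

# AnalysisRunner, analyzers and test_data are assumed to come from the
# surrounding Deequ-wrapper project; their import paths are not shown in
# the original snippets.


class AnalyzerTestCase(unittest.TestCase):

    @classmethod
    def setUpClass(cls):
        # Local SparkSession with the Deequ jar on the classpath, as in Example #1
        cls.spark = (SparkSession
                     .builder
                     .master('local[*]')
                     .config('spark.jars.packages',
                             'com.amazon.deequ:deequ:1.0.5')
                     .appName('analyzer-tests')
                     .getOrCreate())
        cls.df = cls.spark.createDataFrame(test_data)
        cls.runner = AnalysisRunner(cls.spark)

    @classmethod
    def tearDownClass(cls):
        # SparkSession and Java Gateway teardown, mirroring the examples above
        cls.spark.sparkContext._gateway.close()
        cls.spark.stop()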