Example #1
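A standalone script: it starts a local SparkSession with the Deequ jar (com.amazon.deequ:deequ:1.0.5) on the classpath, builds a DataFrame from a small test_data collection, and runs a long chain of constraint checks over its columns _1 .. _5, printing every constraint result. The imports (SparkSession, VerificationSuite, Check, DataFrame) and the test_data collection come from the surrounding project and are not shown in the snippet.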
def main():
    # SparkSession startup
    spark = (SparkSession
              .builder
              .master('local[*]')
              .config('spark.jars.packages',
                      'com.amazon.deequ:deequ:1.0.5')
              .appName('constrain-example')
              .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Constraint verification
    r = (VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2')
                   .hasCompleteness('_2', lambda x: x >= 0.75)
                   .hasUniqueness('_1', lambda x: x == 3/8)
                   .hasDistinctness('_1', lambda x: x == 5/8)
                   .hasUniqueValueRatio('_2', lambda x: x == 0.8)
                   .hasNumberOfDistinctValues('_2', lambda x: x == 6)
                   #.hasHistogram
                   .hasEntropy('_3', lambda x: x > 1)
                   #.hasMutualInformation('_2', '_3', lambda x: x > 0.5)
                   .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
                   .hasMinLength('_1', lambda x: x == 6)
                   .hasMaxLength('_3', lambda x: x == 10)
                   .hasMin('_2', lambda x: x == 1)
                   .hasMax('_2', lambda x: x == 20)
                   .hasMean('_2', lambda x: x > 10)
                   .hasSum('_2', lambda x: x > 50)
                   .hasStandardDeviation('_2', lambda x: x > 5)
                   .hasApproxCountDistinct('_2', lambda x: x == 5)
                   .hasCorrelation('_2', '_5', lambda x: x == 1)
                   .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
                   #.hasPattern("_1", "thing([A-Z])", lambda x: x == 1)
                   #.hasDataType("_1", "string", lambda x: x == 1)
                   .isPositive('_2')
                   .isNonNegative('_2')
                   .isLessThan('_5', '_2', lambda x: x == 0.375)
                   .isLessThanOrEqualTo('_5', '_2', lambda x: x == 0.375)
                   .isGreaterThan('_5', '_2', lambda x: x == 0.125)
                   .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
                   #.isContainedIn('_3', ['DELAYED', 'INTRANSIT'])
                   .isInInterval('_5', 1.0, 50.0)
                   )
         .run()
         )
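    # Wrap the verification result returned by run() as a PySpark DataFrame and print all rows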
    df = DataFrame(r, spark)
    df.show(df.count(), False)

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
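If only the violations are of interest, the same result DataFrame can be filtered on its constraint_status column. A minimal sketch, assuming the result carries the usual Deequ columns constraint, constraint_status and constraint_message:

# Sketch (assumed result columns): keep only the constraints that did not pass
failed = df.filter(df.constraint_status != 'Success')
failed.select('constraint', 'constraint_status', 'constraint_message') \
      .show(truncate=False)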
Example #2
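This example exercises the metrics repository: an AnalysisRunner computes a Size metric over the test DataFrame and stores it in a FileSystemMetricsRepository under a tagged ResultKey; a verification run then appends its own metrics under a second key, and finally the metrics tagged key1=value1 and recorded after timestamp 99000 are loaded back and shown as a DataFrame. As in Example #1, the imports and test_data are assumed to come from the surrounding project.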
def main():
    # SparkSession startup
    spark = (SparkSession
              .builder
              .master('local[*]')
              .config('spark.jars.packages',
                      'com.amazon.deequ:deequ:1.0.5')
              .appName('suggestions-example')
              .getOrCreate())
    df = spark.createDataFrame(test_data)

    # Analysis run
    a = (AnalysisRunner(spark)
         .onData(df)
         .addAnalyzer(analyzers.Size())
         .run())
    key = ResultKey(spark, 100000, {'key1': 'value1'})
    myrepo = FileSystemMetricsRepository(spark, '../test.json')
    myrepo.save(key, a)

    # Verification run
    key2 = repo.ResultKey(spark, 100000, {'key1': 'value2', 'key2': 'value3'})
    v = (base.VerificationSuite(spark)
         .onData(df)
         .addCheck(Check(spark, 'error', 'examples')
                   .hasSize(lambda x: x == 8)
                   .isUnique('_2'))
         .useRepository(myrepo)
         .saveOrAppendResult(key2)
         .run())

    myrepo.load().withTagValues({'key1': 'value1'}).after(99000) \
          .getMetricsAsDF().show()

    # SparkSession and Java Gateway teardown
    spark.sparkContext._gateway.close()
    spark.stop()
Example #3
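Examples #3 through #13 are unit tests for individual constraints. Each builds a Check with a single constraint, runs it through a shared VerificationSuite against the test DataFrame, and asserts that the resulting constraint_status rows match the expected outcome. The fixture the tests rely on is not part of the snippets; the following names are inferred from the tests themselves and may differ in the source project:

# Inferred test fixture (assumption, not shown in the snippets):
#   self.spark   - an active SparkSession with the Deequ jar available
#   self.df      - the shared test DataFrame with columns _1 .. _5
#   self.suite   - a VerificationSuite built on self.spark
#   self.success - a Row with constraint_status == 'Success'
#   self.failure - a Row with constraint_status == 'Failure'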
def test_hasMaxLength(self):
    chk = Check(self.spark) \
        .hasMaxLength('_3', lambda x: x == 10)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #4
def test_hasApproxQuantile(self):
    chk = Check(self.spark) \
        .hasApproxQuantile('_2', 0.5, lambda x: x == 7)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #5
def test_hasNumberOfDistinctValues(self):
    chk = Check(self.spark) \
        .hasNumberOfDistinctValues('_2', lambda x: x == 6)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #6
def test_hasUniqueValueRatio(self):
    chk = Check(self.spark) \
        .hasUniqueValueRatio('_2', lambda x: x == 0.8)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #7
def test_hasCompleteness(self):
    chk = Check(self.spark) \
        .hasCompleteness('_2', lambda x: x >= 0.75)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #8
def test_isUnique(self):
    chk = Check(self.spark) \
        .isUnique('_1')
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.failure])
Example #9
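Note that Example #8 above is the one test in this set that expects a failure: column _1 contains repeated values (its uniqueness is 3/8 in the data set used throughout these examples), so isUnique('_1') yields a Failure status.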
def test_isInInterval(self):
    chk = Check(self.spark) \
        .isInInterval('_5', 1.0, 50.0)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #10
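For column-pair constraints such as isGreaterThanOrEqualTo, the assertion receives the fraction of rows for which the relation holds; here 0.125 corresponds to 1 of the 8 test rows having _5 >= _2.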
def test_isGreaterThanOrEqualTo(self):
    chk = Check(self.spark) \
        .isGreaterThanOrEqualTo('_5', '_2', lambda x: x == 0.125)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #11
def test_isNonNegative(self):
    chk = Check(self.spark) \
        .isNonNegative('_2')
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #12
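satisfies evaluates an arbitrary SQL predicate, and the assertion receives the fraction of rows for which it is true; 0.25 corresponds to 2 of the 8 test rows having _2 > 15.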
def test_satisfies(self):
    chk = Check(self.spark) \
        .satisfies("_2 > 15", "MyCondition", lambda x: x == 0.25)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #13
def test_hasStandardDeviation(self):
    chk = Check(self.spark) \
        .hasStandardDeviation('_2', lambda x: x > 5)
    out = self.suite.onData(self.df).addCheck(chk).run()
    out = DataFrame(out, self.spark).select('constraint_status').collect()
    self.assertEqual(out, [self.success])
Example #14
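A small factory from the same test suite: it builds Check objects against the test's SparkSession, so individual tests do not have to repeat the constructor arguments.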
def createCheck(self,
                level: CheckLevel,
                description: str,
                constraints=None):
    return Check(self._spark_session, level, description, constraints)
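A hypothetical call, assuming the wrapper's CheckLevel enum has an Error member (the constraints argument can be omitted thanks to its None default):

# Hypothetical usage inside a test method
chk = self.createCheck(CheckLevel.Error, 'examples')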