def test_accumulated_report(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    reporter = EmailReporter(
        "*****@*****.**", {"*****@*****.**"}, accumulatedReport=True
    )
    check.run([reporter])
    reporter.sendAccumulatedReport()
    reporter.sendAccumulatedReport("111")
def test_output(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    baos = ByteArrayOutputStream()
    reporter = ConsoleReporter(baos)
    check.run([reporter])
    expected_output = """
\x1b[34mChecking [_1: bigint, _2: string]\x1b[0m
\x1b[34mIt has a total number of 2 columns and 3 rows.\x1b[0m
\x1b[31m- Column _1 is not a key (1 non-unique tuple).\x1b[0m
\x1b[32m- Columns _1, _2 are a key.\x1b[0m
""".strip()
    self.assertEqual(baos.get_output(), expected_output)
def test_output(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    baos = ByteArrayOutputStream()
    reporter = MarkdownReporter(baos)
    check.run([reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
    self.assertEqual(baos.get_output(), expected_output)
def test_satisfies(self):
    df = self.spark.createDataFrame([(1, "a"), (2, "a"), (3, "a")])
    check = Check(df).satisfies("_1 > 0").satisfies(df._2 == 'a')
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint (_2 = a) is satisfied.
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_hasUniqueKey(self):
    df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isAnyOf(self):
    df = self.spark.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
    check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isNeverNull(self):
    df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df).isNeverNull("_1").isNeverNull("_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_hasNumRowsLessThan(self):
    df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df).hasNumRowsLessThan(2).hasNumRowsLessThan(10)
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: The actual number of rows 3 does not satisfy (count < 2).
- *SUCCESS*: The number of rows satisfies (count < 10).
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_hasFunctionalDependency(self):
    df = self.spark.createDataFrame([(1, 2, 1, 1), (9, 9, 9, 2), (9, 9, 9, 3)])
    check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint ... 2 more fields]**
It has a total number of 4 columns and 3 rows.
- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_passed_arguments(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    smtpServer = "*****@*****.**"
    to = {"*****@*****.**"}
    cc = {"*****@*****.**"}
    subjectPrefix = "my subject prefix: "
    smtpPort = 9000
    from_ = "test.ddq.io"
    usernameAndPassword = ("username", "password")
    reportOnlyOnFailure = True
    accumulatedReport = True
    reporter = EmailReporter(
        smtpServer, to, cc, subjectPrefix, smtpPort, from_,
        usernameAndPassword, reportOnlyOnFailure, accumulatedReport
    )
    check.run([reporter])
def test_isFormattedAsDate(self):
    df = self.spark.createDataFrame([
        ("2000-11-23 11:50:10", ),
        ("2000-5-23 11:50:10", ),
        ("2000-02-23 11:11:11", )
    ])
    check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: string]**
It has a total number of 1 columns and 3 rows.
- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isJoinableWith(self):
    base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
    ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
    columnTuple1 = ("_1", "_1")
    columnTuple2 = ("_2", "_2")
    check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**
It has a total number of 3 columns and 3 rows.
- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_hasForeignKey(self):
    base = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
    ref = self.spark.createDataFrame([(1, 2, 100), (1, 3, 100)])
    columnTuple1 = ("_1", "_1")
    columnTuple2 = ("_2", "_2")
    check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**
It has a total number of 3 columns and 3 rows.
- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint ... 1 more field].
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isNeverNull(self):
    df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df).isNeverNull("_1").isNeverNull("_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 is never null.
- *FAILURE*: Column _2 contains 1 row that is null (should never be null).
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_passed_arguments(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    smtpServer = "*****@*****.**"
    to = {"*****@*****.**"}
    cc = {"*****@*****.**"}
    subjectPrefix = "my subject prefix: "
    smtpPort = 9000
    from_ = "test.ddq.io"
    usernameAndPassword = ("username", "password")
    reportOnlyOnFailure = True
    accumulatedReport = True
    reporter = EmailReporter(
        smtpServer, to, cc, subjectPrefix, smtpPort, from_,
        usernameAndPassword, reportOnlyOnFailure, accumulatedReport
    )
    check.run([reporter])
def test_passed_args(self):
    display_name = "display name"
    id = "id"
    cache_method = StorageLevel.DISK_ONLY
    check = Check(self.df, display_name, cache_method, id)

    # check wrapper
    self.assertEqual(check.name, display_name)
    self.assertEqual(check.id, id)
    self.assertEqual(check.cacheMethod, cache_method)

    # check jvm check
    self.assertEqual(check.jvmCheck.getClass().toString(),
                     "class de.frosner.ddq.core.Check")
    self.assertEqual(check.jvmCheck.name(), check.name)
    self.assertEqual(check.jvmCheck.id(), check.id)

    jvm_cache_method = check.jvmCheck.cacheMethod().get()
    self.assertEqual(jvm_cache_method.useDisk(), check.cacheMethod.useDisk)
    self.assertEqual(jvm_cache_method.useMemory(), check.cacheMethod.useMemory)
    self.assertEqual(jvm_cache_method.useOffHeap(), check.cacheMethod.useOffHeap)
    self.assertEqual(jvm_cache_method.deserialized(),
                     check.cacheMethod.deserialized)
    self.assertEqual(jvm_cache_method.replication(),
                     check.cacheMethod.replication)
def test_isAnyOf(self):
    df = self.sqlContext.createDataFrame([(1, "a"), (2, "b"), (3, "c")])
    check = Check(df).isAnyOf("_1", [1, 2]).isAnyOf("_2", ["a", "b", "c"])
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 contains 1 row that is not in Set(1, 2).
- *SUCCESS*: Column _2 contains only values in Set(a, b, c).
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_hasUniqueKey(self):
    df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 is not a key (1 non-unique tuple).
- *SUCCESS*: Columns _1, _2 are a key.
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isConvertibleTo(self):
    df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df)\
        .isConvertibleTo("_1", t.IntegerType())\
        .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [_1#477L, cast(_1#477L as array<int>) AS _1_casted#516]\n+- LogicalRDD [_1#477L, _2#478]
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isConvertibleTo(self):
    df = self.sqlContext.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df)\
        .isConvertibleTo("_1", t.IntegerType())\
        .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve 'cast(_1 as array<int>)' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_satisfies(self):
    df = self.sqlContext.createDataFrame([
        (1, "a"), (2, "a"), (3, "a")
    ])
    check = Check(df).satisfies("_1 > 0").satisfies("_2 = 'a'")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Constraint _1 > 0 is satisfied.
- *SUCCESS*: Constraint _2 = 'a' is satisfied.
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isEqualTo(self):
    df1 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
    df2 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (1, 3, 3)])
    df3 = self.spark.createDataFrame([(1, 2, 3), (1, 2, 5), (9, 9, 9),
                                      (10, 10, 10)])
    expected_output = """
**Checking [_1: bigint, _2: bigint ... 1 more field]**
It has a total number of 3 columns and 3 rows.
- *SUCCESS*: It is equal to [_1: bigint, _2: bigint ... 1 more field].
- *FAILURE*: It is not equal (1 distinct count row is present in the checked dataframe but not in the other and 2 distinct count rows are present in the other dataframe but not in the checked one) to [_1: bigint, _2: bigint ... 1 more field].
""".strip()
    check = Check(df1).isEqualTo(df2).isEqualTo(df3)
    check.run([self.reporter])
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_hasFunctionalDependency(self):
    df = self.sqlContext.createDataFrame([
        (1, 2, 1, 1), (9, 9, 9, 2), (9, 9, 9, 3)
    ])
    check = Check(df).hasFunctionalDependency(["_1", "_2"], ["_3"])
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint, _4: bigint]**
It has a total number of 4 columns and 3 rows.
- *SUCCESS*: Column _3 is functionally dependent on _1, _2.
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isAlwaysNull(self):
    schema = t.StructType([
        t.StructField("_1", t.IntegerType()),
        t.StructField("_2", t.StringType()),
    ])
    df = self.spark.createDataFrame([(1, None), (1, None), (3, None)], schema)
    check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: int, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isMatchingRegex(self):
    df = self.spark.createDataFrame([("Hello A", "world"), ("Hello B", None),
                                     ("Hello C", "World")])
    check = Check(df)\
        .isMatchingRegex("_1", "^Hello")\
        .isMatchingRegex("_2", "world$")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: string, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
    self.assertEqual(self.reporter.output_stream.get_output(), expected_output)
def test_isFormattedAsDate(self):
    df = self.sqlContext.createDataFrame([
        ("2000-11-23 11:50:10", ),
        ("2000-5-23 11:50:10", ),
        ("2000-02-23 11:11:11", )
    ])
    check = Check(df).isFormattedAsDate("_1", "yyyy-MM-dd HH:mm:ss")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: string]**
It has a total number of 1 columns and 3 rows.
- *SUCCESS*: Column _1 is formatted by yyyy-MM-dd HH:mm:ss.
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_default_args(self):
    df = Mock()
    check = Check(df)
    ddq_check = check._jvm.de.frosner.ddq.core.Check
    ddq_check.assert_called_with(
        df._jdf,
        getattr(ddq_check, "apply$default$2")(),
        getattr(ddq_check, "apply$default$3")(),
        getattr(ddq_check, "apply$default$4")(),
        getattr(ddq_check, "apply$default$5")(),
    )
def test_default_args(self):
    check = Check(self.df)
    self.assertEqual(check.name, "DataFrame[_1: bigint, _2: string]")
    self.assertEqual(check.cacheMethod, None)
    try:
        UUID(check.id, version=4)
    except ValueError:
        self.fail("id is not a correct uuid4")
    self.assertEqual(check.jvmCheck.getClass().toString(),
                     "class de.frosner.ddq.core.Check")
def test_output(self):
    with patch("pyddq.reporters.get_field") as get_field:
        baos = ByteArrayOutputStream()
        baos.jvm = self.df._sc._jvm
        get_field.return_value = baos.jvm_obj
        check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
        z = Mock()
        reporter = ZeppelinReporter(z)
        check.run([reporter])
        expected_output = """
%html
</p>
<h4>Checking [_1: bigint, _2: string]</h4>
<h5>It has a total number of 2 columns and 3 rows.</h5>
<table>
<tr><td style="padding:3px">❌</td><td style="padding:3px">Column _1 is not a key (1 non-unique tuple).</td></tr>
<tr><td style="padding:3px">✅</td><td style="padding:3px">Columns _1, _2 are a key.</td></tr>
</table>
<p hidden>
""".strip()
        self.assertEqual(baos.get_output(), expected_output)
def test_isJoinableWith(self):
    base = self.sqlContext.createDataFrame([
        (1, 2, 3), (1, 2, 5), (1, 3, 3)
    ])
    ref = self.sqlContext.createDataFrame([
        (1, 2, 100), (1, 3, 100)
    ])
    columnTuple1 = ("_1", "_1")
    columnTuple2 = ("_2", "_2")
    check = Check(base).isJoinableWith(ref, columnTuple1, columnTuple2)
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**
It has a total number of 3 columns and 3 rows.
- *SUCCESS*: Key _1->_1, _2->_2 can be used for joining. Join columns cardinality in base table: 2. Join columns cardinality after joining: 2 (100.00%).
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isConvertibleTo(self):
    df = self.spark.createDataFrame([(1, "a"), (1, None), (3, "c")])
    check = Check(df)\
        .isConvertibleTo("_1", t.IntegerType())\
        .isConvertibleTo("_1", t.ArrayType(t.IntegerType()))
    check.run([self.reporter])
    # instance ids are in the output
    expected_output = """
**Checking [_1: bigint, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 can be converted from LongType to IntegerType.
- *ERROR*: Checking whether column _1 can be converted to ArrayType(IntegerType,true) failed: org.apache.spark.sql.AnalysisException: cannot resolve '`_1`' due to data type mismatch: cannot cast LongType to ArrayType(IntegerType,true);;
'Project [
+- LogicalRDD [
""".strip()
    for actual, expected in zip(
            self.reporter.output_stream.get_output().split("\n"),
            expected_output.split("\n")):
        self.assertTrue(actual.startswith(expected))
def test_hasForeignKey(self):
    base = self.sqlContext.createDataFrame([
        (1, 2, 3), (1, 2, 5), (1, 3, 3)
    ])
    ref = self.sqlContext.createDataFrame([
        (1, 2, 100), (1, 3, 100)
    ])
    columnTuple1 = ("_1", "_1")
    columnTuple2 = ("_2", "_2")
    check = Check(base).hasForeignKey(ref, columnTuple1, columnTuple2)
    check.run([self.reporter])
    expected_output = """
**Checking [_1: bigint, _2: bigint, _3: bigint]**
It has a total number of 3 columns and 3 rows.
- *SUCCESS*: Columns _1->_1, _2->_2 define a foreign key pointing to the reference table [_1: bigint, _2: bigint, _3: bigint].
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isAlwaysNull(self):
    schema = t.StructType([
        t.StructField("_1", t.IntegerType()),
        t.StructField("_2", t.StringType()),
    ])
    df = self.sqlContext.createDataFrame(
        [(1, None), (1, None), (3, None)], schema
    )
    check = Check(df).isAlwaysNull("_1").isAlwaysNull("_2")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: int, _2: string]**
It has a total number of 2 columns and 3 rows.
- *FAILURE*: Column _1 contains 3 non-null rows (should always be null).
- *SUCCESS*: Column _2 is always null.
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_isMatchingRegex(self):
    df = self.sqlContext.createDataFrame([
        ("Hello A", "world"), ("Hello B", None), ("Hello C", "World")
    ])
    check = Check(df)\
        .isMatchingRegex("_1", "^Hello")\
        .isMatchingRegex("_2", "world$")
    check.run([self.reporter])
    expected_output = """
**Checking [_1: string, _2: string]**
It has a total number of 2 columns and 3 rows.
- *SUCCESS*: Column _1 matches ^Hello
- *FAILURE*: Column _2 contains 1 row that does not match world$
""".strip()
    self.assertEqual(
        self.reporter.output_stream.get_output(),
        expected_output
    )
def test_passed_args(self):
    df = Mock()
    display_name = Mock()
    cache_method = Mock()
    id = Mock()
    df._sc._jvm.scala.Some.apply = Mock(
        side_effect=["Some(displayName)", "Some(cacheMethod)"]
    )
    check = Check(df, display_name, cache_method, id)
    ddq_check = check._jvm.de.frosner.ddq.core.Check
    ddq_check.assert_called_with(
        df._jdf,
        "Some(displayName)",
        "Some(cacheMethod)",
        getattr(ddq_check, "apply$default$4")(),
        id
    )
def test_default_arguments(self):
    check = Check(self.df).hasUniqueKey("_1").hasUniqueKey("_1", "_2")
    reporter = EmailReporter("*****@*****.**", {"*****@*****.**"})
    check.run([reporter])
def setUp(self):
    self.check = Check(Mock())
    self.jvmCheck = self.check.jvmCheck
rawZoneInputPath = rawZoneAdlsPath + \
    'WorldWideImporters/orders/2020/04/07/*.parquet'
explorationZoneOutputPath = explorationZoneAdlsPath + \
    'deta-lake/WorldWideImporters/orders'

# Configure the Spark session that connects Python to Spark and enables the
# Delta Lake extension with the Azure log store.
spark = SparkSession \
    .builder \
    .master("local[*]") \
    .enableHiveSupport() \
    .config("spark.driver.bindAddress", "127.0.0.1") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.AzureLogStore") \
    .getOrCreate()

# Load the Delta table from the exploration zone and inspect its schema.
test = spark.read.format("delta").load(explorationZoneOutputPath)
test.dtypes

# Run basic data quality checks against the loaded table.
Check(test) \
    .hasNumRowsGreaterThan(0) \
    .hasUniqueKey("OrderID") \
    .isNeverNull("CustomerID") \
    .run()
def setUp(self):
    df = get_df()
    self.check = Check(df)
    self.jvmCheck = self.check.jvmCheck
class ConstraintTest(unittest.TestCase):
    COLUMN_NAME = "column name"

    def setUp(self):
        df = get_df()
        self.check = Check(df)
        self.jvmCheck = self.check.jvmCheck

    def test_hasUniqueKey(self):
        column_names = ["a", "b"]
        jvm_column_names = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=jvm_column_names
            )
        self.check.hasUniqueKey(self.COLUMN_NAME, column_names)
        self.jvmCheck.hasUniqueKey.assert_called_with(
            self.COLUMN_NAME, jvm_column_names
        )

    def test_hasNumRowsEqualTo(self):
        num_rows = 10
        self.check.hasNumRowsEqualTo(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            equalTo.assert_called_with(num_rows)

    def test_hasNumRowsGreaterThan(self):
        num_rows = 10
        self.check.hasNumRowsGreaterThan(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            greaterThan.assert_called_with(num_rows)

    def test_hasNumRowsLessThan(self):
        num_rows = 10
        self.check.hasNumRowsLessThan(num_rows)
        self.check._jvm.de.frosner.ddq.constraints.NumberOfRowsConstraint.\
            lessThan.assert_called_with(num_rows)

    def test_isNeverNull(self):
        self.check.isNeverNull(self.COLUMN_NAME)
        self.jvmCheck.isNeverNull.assert_called_with(self.COLUMN_NAME)

    def test_isAlwaysNull(self):
        self.check.isAlwaysNull(self.COLUMN_NAME)
        self.jvmCheck.isAlwaysNull.assert_called_with(self.COLUMN_NAME)

    def test_isConvertibleTo(self):
        target_type = Mock()
        target_type.json = Mock(return_value="json value")
        jvm_type = Mock()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson = Mock(
            return_value=jvm_type
        )
        self.check.isConvertibleTo(self.COLUMN_NAME, target_type)
        target_type.json.assert_called()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson.\
            assert_called_with("json value")
        self.jvmCheck.isConvertibleTo.assert_called_with(
            self.COLUMN_NAME, jvm_type
        )

    def test_isFormattedAsDate(self):
        date_format = "yyyy-MM-dd HH:mm:ss"
        self.check.isFormattedAsDate(self.COLUMN_NAME, date_format)
        self.jvmCheck.isFormattedAsDate.assert_called_with(self.COLUMN_NAME,
                                                           date_format)

    def test_isAnyOf(self):
        allowed = ("a", "b", "c")
        jvm_allowed = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toSet = Mock(
                return_value=jvm_allowed
            )
        self.check.isAnyOf(self.COLUMN_NAME, allowed)
        self.jvmCheck.isAnyOf.assert_called_with(self.COLUMN_NAME, jvm_allowed)

    def test_isMatchingRegex(self):
        regex = "^regex$"
        self.check.isMatchingRegex(self.COLUMN_NAME, regex)
        self.jvmCheck.isMatchingRegex.assert_called_with(self.COLUMN_NAME,
                                                         regex)

    def test_hasFunctionalDependency(self):
        determinant_set = ["column1", "column2"]
        dependent_set = ["column3", "column4"]
        jvm_determinant_set = Mock()
        jvm_dependent_set = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                side_effect=[jvm_determinant_set, jvm_dependent_set]
            )
        self.check.hasFunctionalDependency(determinant_set, dependent_set)
        self.jvmCheck.hasFunctionalDependency.assert_called_with(
            jvm_determinant_set, jvm_dependent_set
        )

    def test_hasForeignKey(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")
        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()
        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
            )
        self.check.hasForeignKey(ref, key_map1, key_map2)
        self.jvmCheck.hasForeignKey.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_isJoinableWith(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")
        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()
        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
            )
        self.check.isJoinableWith(ref, key_map1, key_map2)
        self.jvmCheck.isJoinableWith.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_satisfies(self):
        constraint = "_1 > 10"
        self.check.satisfies(constraint)
        self.jvmCheck.satisfies.assert_called_with(constraint)

    def test_isEqualTo(self):
        df2 = Mock()
        self.check.isEqualTo(df2)
        self.jvmCheck.isEqualTo.assert_called_with(df2._jdf)
class ConstraintTest(unittest.TestCase):
    COLUMN_NAME = "column name"

    def setUp(self):
        df = get_df()
        self.check = Check(df)
        self.jvmCheck = self.check.jvmCheck

    def test_hasUniqueKey(self):
        column_names = ["a", "b"]
        jvm_column_names = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=jvm_column_names
            )
        self.check.hasUniqueKey(self.COLUMN_NAME, column_names)
        self.jvmCheck.hasUniqueKey.assert_called_with(
            self.COLUMN_NAME, jvm_column_names
        )

    def test_isNeverNull(self):
        self.check.isNeverNull(self.COLUMN_NAME)
        self.jvmCheck.isNeverNull.assert_called_with(self.COLUMN_NAME)

    def test_isAlwaysNull(self):
        self.check.isAlwaysNull(self.COLUMN_NAME)
        self.jvmCheck.isAlwaysNull.assert_called_with(self.COLUMN_NAME)

    def test_isConvertibleTo(self):
        target_type = Mock()
        target_type.json = Mock(return_value="json value")
        jvm_type = Mock()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson = Mock(
            return_value=jvm_type
        )
        self.check.isConvertibleTo(self.COLUMN_NAME, target_type)
        target_type.json.assert_called()
        self.check._jvm.org.apache.spark.sql.types.DataType.fromJson.\
            assert_called_with("json value")
        self.jvmCheck.isConvertibleTo.assert_called_with(
            self.COLUMN_NAME, jvm_type
        )

    def test_isFormattedAsDate(self):
        date_format = "yyyy-MM-dd HH:mm:ss"
        self.check.isFormattedAsDate(self.COLUMN_NAME, date_format)
        self.jvmCheck.isFormattedAsDate.assert_called_with(self.COLUMN_NAME,
                                                           date_format)

    def test_isAnyOf(self):
        allowed = ("a", "b", "c")
        jvm_allowed = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toSet = Mock(
                return_value=jvm_allowed
            )
        self.check.isAnyOf(self.COLUMN_NAME, allowed)
        self.jvmCheck.isAnyOf.assert_called_with(self.COLUMN_NAME, jvm_allowed)

    def test_isMatchingRegex(self):
        regex = "^regex$"
        self.check.isMatchingRegex(self.COLUMN_NAME, regex)
        self.jvmCheck.isMatchingRegex.assert_called_with(self.COLUMN_NAME,
                                                         regex)

    def test_hasFunctionalDependency(self):
        determinant_set = ["column1", "column2"]
        dependent_set = ["column3", "column4"]
        jvm_determinant_set = Mock()
        jvm_dependent_set = Mock()
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                side_effect=[jvm_determinant_set, jvm_dependent_set]
            )
        self.check.hasFunctionalDependency(determinant_set, dependent_set)
        self.jvmCheck.hasFunctionalDependency.assert_called_with(
            jvm_determinant_set, jvm_dependent_set
        )

    def test_hasForeignKey(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")
        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()
        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
            )
        self.check.hasForeignKey(ref, key_map1, key_map2)
        self.jvmCheck.hasForeignKey.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_isJoinableWith(self):
        key_map1 = ("_1", "_1")
        key_map2 = ("_1", "_2")
        ref = Mock()
        jvm_key_map1 = Mock()
        jvm_key_map2 = Mock()
        self.check._jvm.scala.Tuple2 = Mock(
            side_effect=[jvm_key_map1, jvm_key_map2]
        )
        self.check._jvm.scala.collection.JavaConversions.\
            iterableAsScalaIterable().toList = Mock(
                return_value=[jvm_key_map2]
            )
        self.check.isJoinableWith(ref, key_map1, key_map2)
        self.jvmCheck.isJoinableWith.assert_called_with(
            ref._jdf, jvm_key_map1, [jvm_key_map2]
        )

    def test_satisfies(self):
        constraint = "_1 > 10"
        self.check.satisfies(constraint)
        self.jvmCheck.satisfies.assert_called_with(constraint)

    def test_isEqualTo(self):
        df2 = Mock()
        self.check.isEqualTo(df2)
        self.jvmCheck.isEqualTo.assert_called_with(df2._jdf)