def test_inspect_inferred(self) -> None:
    """Inspect the default samples against the default schema.

    Expects an attribute integrity of 1.0, an attribute specification of
    0.0 and a resulting quality index of 0.5.
    """
    # arrange
    sample_records = DataLoader.load_samples()
    definition = SchemaDefinition.create(DataLoader.load_schema())

    # act
    metrics = self.inspector.inspect(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
    self.assertEqual(.0, metrics.attribute_specification)
    self.assertEqual(.5, metrics.attribute_quality_index)
def test_inspect_with_inferred_schemas(self):
    """Inspect the default samples against ``schema_registry_json.json``.

    The definition is created with ``True`` as second argument
    (presumably the "inferred" flag - confirm against SchemaDefinition).
    Expects integrity 1.0, specification 0.0 and quality index 0.5.
    """
    # arrange
    raw_schema = DataLoader.load_schema_with_name(
        "schema_registry_json.json")
    definition = SchemaDefinition.create(raw_schema, True)
    sample_records = DataLoader.load_samples()

    # act
    metrics = self.inspector.inspect(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
    self.assertEqual(.0, metrics.attribute_specification)
    self.assertEqual(.5, metrics.attribute_quality_index)
def test_convert_with_nested_expectations(self):
    """Convert a schema with nested expectations and compare the output.

    The converted ``schema_content`` must equal the content of
    ``schema_nested_expectation_result_json.json``.
    """
    # arrange
    source_schema = DataLoader.load_schema_with_name(
        "schema_nested_expectation_json.json")
    expected_schema = DataLoader.load_schema_with_name(
        "schema_nested_expectation_result_json.json")
    definition = SchemaDefinition.create(source_schema, False)

    # act
    converted = self.parser.convert_expectations(definition)

    # assert
    # NOTE(review): the helper name looks like a typo of "assertString...";
    # it is defined outside this block, so it is used as-is here.
    self.assertStingEqualAsDict(converted.schema_content, expected_schema)
def test_inspect_with_both_schema_formats(self):
    """Inspect the same samples with two schema files and compare results.

    The results for ``schema_diff_json.json`` and ``schema_diff_avro.json``
    must be equal.
    """
    # arrange
    json_schema = DataLoader.load_schema_with_name("schema_diff_json.json")
    avro_schema = DataLoader.load_schema_with_name("schema_diff_avro.json")
    sample_records = DataLoader.load_samples()

    # act
    json_definition = SchemaDefinition.create(json_schema, False)
    json_metrics = self.inspector.inspect(sample_records, json_definition)
    avro_definition = SchemaDefinition.create(avro_schema, False)
    avro_metrics = self.inspector.inspect(sample_records, avro_definition)

    # assert
    self.assertEqual(json_metrics, avro_metrics)
def test_integrity_on_attribute_level_with_not_specified_partial_field(
        self) -> None:
    """Check per-attribute integrity of a field the schema does not list.

    ``random_string`` is absent from the schema and from the third sample.
    It must still show up in the attribute details, with an integrity of 1
    (compared to 3 decimal places).
    """
    # arrange
    samples = [
        {"random_int": 1002, "random_string": 1},
        {"random_int": 1003, "random_string": 2},
        {"random_int": 1004},
    ]
    schema_definition = DataLoader.expand_schema(
        [("random_int", "integer")], [])

    # act
    result = self.inspector.inspect_attributes(samples, schema_definition)

    # assert
    attribute_details = result.attribute_details
    # assertIn reports the container content on failure, unlike assertTrue.
    self.assertIn('random_string', attribute_details,
                  "Missing integrity for attribute random_string")
    self.assertAlmostEqual(
        1, attribute_details['random_string'].attribute_integrity, 3,
        "Integrity of random_string is not correct")
def test_specification_on_attribute_level_with_partial_expectations(
        self) -> None:
    """Check per-attribute specification with partly defined expectations.

    Both attributes are typed in the schema; only ``random_int`` carries an
    additional ``minimum`` entry. Expects a specification of 0.75 for
    ``random_int`` and 0.5 for ``random_string``.
    """
    # arrange
    samples = [
        {"random_int": 1002, "random_string": 1},
        {"random_int": 1003, "random_string": 2},
    ]
    schema_definition = DataLoader.expand_schema(
        [("random_int", "integer"), ("random_string", "string")],
        [],
        {"random_int": {"minimum": 0}})

    # act
    result = self.inspector.inspect_attributes(samples, schema_definition)

    # assert
    attribute_details = result.attribute_details
    # assertIn reports the container content on failure, unlike assertTrue.
    self.assertIn('random_int', attribute_details)
    self.assertEqual(
        .75, attribute_details['random_int'].attribute_specification)
    self.assertEqual(
        .5, attribute_details['random_string'].attribute_specification)
def test_specification_on_attribute_level_with_missing_specification(
        self) -> None:
    """Check per-attribute specification of a field the schema omits.

    ``random_string`` occurs in the samples but not in the schema. It must
    be listed in the attribute details with a specification of 0.0.
    """
    # arrange
    samples = [
        {"random_int": 1002, "random_string": 1},
        {"random_int": 1003, "random_string": 2},
    ]
    schema_definition = DataLoader.expand_schema(
        [("random_int", "integer")], [])

    # act
    result = self.inspector.inspect_attributes(samples, schema_definition)

    # assert
    attribute_details = result.attribute_details
    # assertIn reports the container content on failure, unlike assertTrue.
    self.assertIn('random_string', attribute_details)
    self.assertEqual(
        0.0, attribute_details['random_string'].attribute_specification)
def test_inspect_with_non_unique_types_does_not_throw_exception(
        self) -> None:
    """Inspect samples whose ``random_int`` values mix int and str.

    The inspection must complete and report an integrity of 1/3 for
    ``random_int`` (compared to 3 decimal places).
    """
    # arrange
    samples = [
        {"random_int": 1002},
        {"random_int": "1003"},
        {"random_int": "1004"},
    ]
    schema_definition = DataLoader.expand_schema(
        [("random_int", "integer")],
        [],
        {"random_int": {"minimum": 0, "maximum": 100}})

    # act
    result = self.inspector.inspect(samples, schema_definition)

    # assert
    attribute_details = result.attribute_details
    # assertAlmostEquals was a deprecated alias and is removed in
    # Python 3.12; assertAlmostEqual is the supported spelling.
    self.assertAlmostEqual(
        (1 / 3), attribute_details['random_int'].attribute_integrity, 3)
def test_inspect_with_missing_field(self):
    """Inspect a sample that lacks the only attribute named in the schema.

    The schema lists ``random_int`` (also passed in the second argument of
    ``expand_schema``, presumably the required fields), while the sample
    only holds ``random_other``. Specification, integrity and quality
    index are all expected to be 0.5.
    """
    # arrange
    sample_records = [{"random_other": "other"}]
    definition = DataLoader.expand_schema(
        [("random_int", "integer")], ["random_int"])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    specification_target = (0 + 1) / 2
    integrity_target = (1 + 0) / 2
    quality_target = (specification_target + integrity_target) / 2
    self.assertEqual(specification_target,
                     metrics.attribute_specification,
                     "Attribute specification is not correct")
    self.assertEqual(integrity_target,
                     metrics.attribute_integrity,
                     "Attribute integrity is not correct")
    self.assertEqual(quality_target,
                     metrics.attribute_quality_index,
                     "Attribute quality is not correct")
def test_quality_with_complete_specification(self) -> None:
    """Check all metrics when types and expectations are given.

    Both attributes are typed and carry expectations (a ``pattern`` for
    ``random_string``, ``minimum``/``maximum`` for ``random_int``).
    Expects integrity 0.75, specification 1.0 and quality index 0.875.
    """
    # arrange
    sample_records = [
        {"random_int": 1, "random_string": "foo"},  # random_string does not match
        {"random_int": 2, "random_string": "bar"},
    ]
    attribute_types = [("random_string", "string"), ("random_int", "number")]
    expectations = {
        "random_string": {"pattern": "bar"},
        "random_int": {"minimum": 0, "maximum": 100},
    }
    definition = DataLoader.expand_schema(attribute_types, [], expectations)

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(.75, metrics.attribute_integrity)
    self.assertEqual(1.0, metrics.attribute_specification)
    self.assertEqual(.875, metrics.attribute_quality_index)
def test_inspect_with_multiple_expectations_asyncapi_style_json(self):
    """Inspect samples against ``schema_expectation_asyncapi_style_json.json``.

    Expects an overall integrity of 6/10 (compared to 3 decimal places).
    The per-sample annotations below name the values that are expected
    to violate the schema.
    """
    # arrange
    raw_schema = DataLoader.load_schema_with_name(
        "schema_expectation_asyncapi_style_json.json")
    sample_records = [
        {'random_integer': 1, 'random_string': 'id_1'},
        {'random_integer': 2, 'random_string': 'foo'},  # no match (string)
        {'random_integer': 3, 'random_string': 'id_3'},
        {'random_integer': 4, 'random_string': 'id_4'},  # no match (integer)
        {'random_integer': 5, 'random_string': 'foo'},  # no match (integer, string)
    ]

    # act
    definition = SchemaDefinition.create(raw_schema, False)
    metrics = self.inspector.inspect(sample_records, definition)

    # assert
    self.assertAlmostEqual(6 / 10, metrics.attribute_integrity, 3)
def test_create_avro_parser(self):
    """The factory must return an AvroSchemaParser for the default schema."""
    # arrange
    default_schema = DataLoader.load_schema()
    schema_definition = SchemaDefinition.create(default_schema, False)

    # act
    created_parser = SchemaParserFactory.create(schema_definition)

    # assert
    self.assertIsInstance(created_parser, AvroSchemaParser)
def test_integrity_with_wrong_type(self) -> None:
    """Check integrity after one dummy value is given a different type.

    ``random_string`` of the first dummy sample is overwritten with the
    integer 123; the overall integrity is then expected to be 0.5.
    """
    # arrange
    sample_records, definition = DataLoader.create_dummy_samples()
    first_record = sample_records[0]
    # noinspection PyTypeChecker
    first_record['random_string'] = 123

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(0.5, metrics.attribute_integrity)
def test_integrity_with_missing_not_required(self) -> None:
    """Check that a ``None`` value leaves integrity at 1.0.

    ``random_int`` is typed as integer and the second argument of
    ``expand_schema`` (presumably the required fields) is empty.
    """
    # arrange
    sample_records = [{"random_int": value} for value in (1, None, 2)]
    definition = DataLoader.expand_schema([("random_int", "integer")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
def test_specification_from_toeggelomat(self):
    """Inspect the toeggelomat samples against the toeggelomat schema.

    Expects exactly 53 attribute entries, each with a specification and an
    integrity of 1.0.
    """
    # arrange
    samples = DataLoader.load_samples_from_file("samples_toeggelomat.json")

    # act
    schema = DataLoader.load_schema_with_name("schema_toeggelomat.json")
    result = self.inspector.inspect(samples,
                                    SchemaDefinition.create(schema, False))

    # assert
    attribute_details = result.attribute_details
    self.assertEqual(53, len(attribute_details),
                     "There should be 53 keys in the dictionary")
    # Iterate key/value pairs once instead of re-indexing the mapping.
    for attribute_metric, details in attribute_details.items():
        self.assertEqual(
            1.0, details.attribute_specification,
            f"Attribute specification must be 100% ({attribute_metric})")
        self.assertEqual(
            1.0, details.attribute_integrity,
            f"Attribute integrity must be 100% ({attribute_metric})")
def test_integrity_with_float_as_int(self) -> None:
    """Check that the string "10000001.023" fails an integer attribute.

    The overall integrity is expected to be 0.0.
    """
    # arrange
    sample_records = [{"random_int": "10000001.023"}]
    definition = DataLoader.expand_schema([("random_int", "integer")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(0.0, metrics.attribute_integrity)
def test_integrity_without_provided_schema(self) -> None:
    """Inspect the dummy samples against ``SchemaDefinition.empty()``.

    Expects integrity 1.0, specification 0.0 and quality index 0.5.
    """
    # arrange
    sample_records, _ = DataLoader.create_dummy_samples()
    definition = SchemaDefinition.empty()

    # act
    metrics = self.inspector.inspect(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
    self.assertEqual(.0, metrics.attribute_specification)
    self.assertEqual(.5, metrics.attribute_quality_index)
def test_integrity_without_specified_required_field(self) -> None:
    """Check integrity when the samples lack a listed attribute.

    ``random_string`` is typed in the schema and passed in the second
    argument of ``expand_schema`` (presumably the required fields), but no
    sample contains it. The overall integrity is expected to be 0.5.
    """
    # arrange
    sample_records = [{"random_int": value} for value in (1, 2, 3)]
    attribute_types = [("random_int", "integer"), ("random_string", "string")]
    definition = DataLoader.expand_schema(attribute_types, ["random_string"])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(.5, metrics.attribute_integrity)
def test_load_required_types_for_deeply_nested_schema(self):
    """Load the required types from ``schema_registry_avro_complex.json``.

    The first element returned by ``load_required_types_from_schema`` must
    list the nested attributes as slash-separated paths in the expected
    order.
    """
    # arrange
    raw_schema = DataLoader.load_schema_with_name(
        "schema_registry_avro_complex.json")
    parsed_schema = json.loads(raw_schema)
    expected_paths = [
        "complex/subtypeString",
        "complex/subtypeComplex/subtypeNumber",
        "simpleNumber",
    ]

    # act
    loaded = self.parser.load_required_types_from_schema(parsed_schema)
    type_definitions, _ = loaded

    # assert
    self.assertListEqual(expected_paths, type_definitions)
def test_integrity_with_negative_as_string(self) -> None:
    """Check that the string "-10000" fails an integer attribute.

    The overall integrity is expected to be 0.0.
    """
    # arrange
    sample_records = [{"random_int": "-10000"}]
    definition = DataLoader.expand_schema([("random_int", "integer")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    failure_hint = ("Attribute integrity must be 0% (even if not required, a "
                    "specified value needs to be correct).")
    self.assertEqual(.0, metrics.attribute_integrity, failure_hint)
def test_load_required_types_for_deeply_nested_schema(self):
    """Load the required types from ``schema_inferred_complex.json``.

    Uses a fresh ``JsonSchemaParser``; the first element returned by
    ``load_required_types_from_schema`` must list the attributes as
    slash-separated paths in the expected order.
    """
    # NOTE(review): another test in this file has the same method name;
    # if both live in the same class, the earlier one is shadowed and never
    # runs - confirm they belong to different test classes.

    # arrange
    schema = DataLoader.load_schema_with_name(
        "schema_inferred_complex.json")
    schema_obj = json.loads(schema)

    # act
    type_definitions, _ = JsonSchemaParser(
    ).load_required_types_from_schema(schema_obj)

    # assert
    # Expected value first, matching the other assertions in this file.
    self.assertListEqual([
        "base", "complex/type1number", "complex/type3complex/subtype1number"
    ], type_definitions)
def test_quality_without_specification(self):
    """Check the quality index when ``expand_schema`` gets two empty lists.

    The quality index is expected to be 0.5.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "foo"), (2, "bar"))
    ]
    definition = DataLoader.expand_schema([], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(.5, metrics.attribute_quality_index)
def test_inspect_with_unspecified_field(self):
    """Inspect one sample against a schema built from two empty lists.

    Expects specification 0, integrity 1 and quality index 0.5.
    """
    # arrange
    sample_records = [{"random_int": 1}]
    definition = DataLoader.expand_schema([], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(0, metrics.attribute_specification)
    self.assertEqual(1, metrics.attribute_integrity)
    self.assertEqual(.5, metrics.attribute_quality_index)
def test_specification_with_complete_specification(self) -> None:
    """Check specification when every sample attribute is typed.

    Both attributes are typed and passed in the second argument of
    ``expand_schema``; no third argument is given. The specification is
    expected to be 0.5.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "foo"), (2, "bar"))
    ]
    attribute_types = [("random_int", "integer"), ("random_string", "string")]
    listed_fields = ["random_string", "random_int"]
    definition = DataLoader.expand_schema(attribute_types, listed_fields)

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(.5, metrics.attribute_specification)
def test_quality_with_partial_specification(self) -> None:
    """Check all metrics when attributes are typed without expectations.

    Expects integrity 1.0, specification 0.5 and quality index 0.75.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "foo"), (2, "bar"))
    ]
    # NOTE(review): the type name here is "int", whereas the other tests
    # in this file use "integer" - confirm this is intentional.
    attribute_types = [("random_string", "string"), ("random_int", "int")]
    definition = DataLoader.expand_schema(attribute_types, [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
    self.assertEqual(.5, metrics.attribute_specification)
    self.assertEqual(.75, metrics.attribute_quality_index)
def test_integrity_for_complex_type(self):
    """Check integrity for nested samples with ``schema_registry_avro.json``.

    The second sample lacks ``iss_position.longitude``. The third has a
    string ``timestamp`` and a numeric ``longitude``. The expected
    integrity is 9/12 (compared to 3 decimal places).
    """
    # arrange
    raw_schema = DataLoader.load_schema_with_name("schema_registry_avro.json")
    complete_record = {
        "timestamp": 1595601702,
        "iss_position": {"longitude": "-42.2948", "latitude": "-40.3670"},
        "message": "success",
    }
    record_without_longitude = {
        "timestamp": 1595601702,
        "iss_position": {"latitude": "-40.3670"},
        "message": "success",
    }
    record_with_wrong_types = {
        "timestamp": "wrong",
        "iss_position": {"longitude": 666, "latitude": "-40.0283"},
        "message": "success",
    }
    sample_records = [
        complete_record,
        record_without_longitude,
        record_with_wrong_types,
    ]

    # act
    definition = SchemaDefinition.create(raw_schema, False)
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert - only message is not mandatory so 3 out of 12 (3*4) are
    # missing or wrong
    broken_count = 3
    total_count = 12
    expected_integrity = (total_count - broken_count) / total_count
    self.assertAlmostEqual(
        expected_integrity, metrics.attribute_integrity, 3,
        f"Integrity must be {expected_integrity * 100}%")
def test_specification_with_partial_specification(self) -> None:
    """Check specification when only one of two attributes is typed.

    Only ``random_string`` appears in the schema; the overall
    specification is expected to be 0.25.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "foo"), (2, "bar"))
    ]
    definition = DataLoader.expand_schema([("random_string", "string")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert (half of the data is specified to .5)
    failure_hint = ("Specification must be 25% because only half of the "
                    "data is specified in schema")
    self.assertEqual(.25, metrics.attribute_specification, failure_hint)
def test_specification_with_irrelevant_specification(self) -> None:
    """Check specification when the schema names an unused attribute.

    The schema only lists ``random_other``, which no sample contains; the
    overall specification is expected to be 0.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "foo"), (2, "bar"))
    ]
    definition = DataLoader.expand_schema([("random_other", "string")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    failure_hint = ("Specification must be 0% because none of the "
                    "attributes are specified")
    self.assertEqual(0, metrics.attribute_specification, failure_hint)
def test_integrity_with_additional_field(self) -> None:
    """Check that an attribute missing from the schema keeps integrity 1.0.

    The samples carry ``random_string`` in addition to the typed
    ``random_int``.
    """
    # arrange
    sample_records = [
        {"random_int": number, "random_string": text}
        for number, text in ((1, "abc"), (2, "efg"), (3, "hij"))
    ]
    definition = DataLoader.expand_schema([("random_int", "integer")], [])

    # act
    metrics = self.inspector.inspect_attributes(sample_records, definition)

    # assert
    self.assertEqual(1.0, metrics.attribute_integrity)
def test_integrity_on_attribute_level_with_missing_value(self) -> None:
    """Check per-attribute integrity when some values have the wrong type.

    ``random_int`` is typed as integer and holds one string value;
    ``random_string`` is typed as string and holds three integer values.
    Expects an integrity of 3/4 for ``random_int`` and 1/4 for
    ``random_string`` (compared to 3 decimal places).
    """
    # arrange
    samples = [
        {"random_int": 1002, "random_string": 1},
        {"random_int": 1003, "random_string": 2},
        {"random_int": "foo", "random_string": 3},
        {"random_int": 1005, "random_string": "fourth"},
    ]
    schema_definition = DataLoader.expand_schema(
        [("random_int", "integer"), ("random_string", "string")], [])

    # act
    result = self.inspector.inspect_attributes(samples, schema_definition)

    # assert
    attribute_details = result.attribute_details
    # assertIn reports the container content on failure, unlike assertTrue.
    self.assertIn('random_int', attribute_details,
                  "Missing integrity for attribute random_int")
    self.assertIn('random_string', attribute_details,
                  "Missing integrity for attribute random_string")
    self.assertAlmostEqual(
        (3 / 4), attribute_details['random_int'].attribute_integrity, 3,
        "Integrity of random_int is not correct")
    self.assertAlmostEqual(
        (1 / 4), attribute_details['random_string'].attribute_integrity, 3,
        "Integrity of random_string is not correct")