def test_inspect_with_both_schema_formats(self):
    # arrange
    schema_json = DataLoader.load_schema_with_name("schema_diff_json.json")
    schema_avro = DataLoader.load_schema_with_name("schema_diff_avro.json")
    samples = DataLoader.load_samples()

    # act
    result_json = self.inspector.inspect(
        samples, SchemaDefinition.create(schema_json, False))
    result_avro = self.inspector.inspect(
        samples, SchemaDefinition.create(schema_avro, False))

    # assert
    self.assertEqual(result_json, result_avro)
def test_inspect_with_multiple_expectations_asyncapi_style_json(self):
    # arrange
    schema = DataLoader.load_schema_with_name(
        "schema_expectation_asyncapi_style_json.json")
    samples = [
        { 'random_integer': 1, 'random_string': 'id_1' },
        { 'random_integer': 2, 'random_string': 'foo' },  # no match (string)
        { 'random_integer': 3, 'random_string': 'id_3' },
        { 'random_integer': 4, 'random_string': 'id_4' },  # no match (integer)
        { 'random_integer': 5, 'random_string': 'foo' },  # no match (integer, string)
    ]

    # act
    metrics = self.inspector.inspect(
        samples, SchemaDefinition.create(schema, False))

    # assert - 4 of the 10 attribute values violate an expectation, so integrity is 6/10
    self.assertAlmostEqual(6 / 10, metrics.attribute_integrity, 3)
def expand_schema(cls,
                  types: list,
                  required_types: list = None,
                  expectations: dict = None) -> SchemaDefinition:
    if required_types is None:
        required_types = []
    if expectations is None:
        expectations = {}

    properties = ""
    for name, t in types:
        # required attributes keep their plain type, optional ones become a union with "null"
        type_description = f'"{t}"' if name in required_types else f'["{t}", "null"]'
        properties += '{"name": "' + name + '","type": ' + type_description
        if name in expectations:
            for key, value in expectations[name].items():
                inner_value = f'"{value}"' if type(value) is str else value
                properties += f', "{key}": {inner_value}'
        properties += '},'

    schema = r'''
    {
        "type": "record",
        "name": "RandomMessage",
        "namespace": "data.producer.random",
        "fields": [
            #PROPERTIES#
        ]
    }
    '''
    schema_content = schema.replace("#PROPERTIES#", properties.rstrip(','))
    return SchemaDefinition.create(schema_content, False)
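# Illustration (not part of the original suite, derived from the builder above):
# a call such as
#
#   expand_schema([("random_integer", "int")],
#                 required_types=["random_integer"],
#                 expectations={"random_integer": {"minimum": 0}})
#
# yields an Avro record whose single required field carries the expectation
# keyword inline:
#
#   {
#       "type": "record",
#       "name": "RandomMessage",
#       "namespace": "data.producer.random",
#       "fields": [
#           {"name": "random_integer", "type": "int", "minimum": 0}
#       ]
#   }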
def test_convert_with_unsupported_property(self):
    # arrange - an unsupported property must pass through unchanged
    schema = '''
    {
        "fields": [
            {
                "name": "random_string",
                "type": "string",
                "foobar": "id_"
            }
        ]
    }
    '''
    expected_schema = '''
    {
        "fields": [
            {
                "name": "random_string",
                "type": "string",
                "foobar": "id_"
            }
        ]
    }
    '''

    # act
    result = self.parser.convert_expectations(
        SchemaDefinition.create(schema, False))

    # assert
    self.assertStingEqualAsDict(result.schema_content, expected_schema)
def test_convert_with_multiple_expectations(self):
    # arrange
    schema = '''
    {
        "fields": [
            {
                "name": "random_integer",
                "type": "integer",
                "minimum": 0,
                "maximum": 3
            },
            {
                "name": "random_string",
                "type": "string",
                "pattern": "id_"
            }
        ]
    }
    '''
    expected_schema = '''
    {
        "fields": [
            {
                "name": "random_integer",
                "type": "integer",
                "minimum": 0,
                "maximum": 3,
                "expectations": [
                    {
                        "kwargs": {
                            "min_value": 0,
                            "max_value": 3
                        },
                        "expectation_type": "expect_column_values_to_be_between"
                    }
                ]
            },
            {
                "name": "random_string",
                "type": "string",
                "pattern": "id_",
                "expectations": [
                    {
                        "kwargs": {
                            "regex": "id_"
                        },
                        "expectation_type": "expect_column_values_to_match_regex"
                    }
                ]
            }
        ]
    }
    '''

    # act
    result = self.parser.convert_expectations(
        SchemaDefinition.create(schema, False))

    # assert
    self.assertStingEqualAsDict(result.schema_content, expected_schema)
def test_create_avro_parser(self):
    # arrange
    definition = SchemaDefinition.create(DataLoader.load_schema(), False)

    # act
    parser = SchemaParserFactory.create(definition)

    # assert
    self.assertIsInstance(parser, AvroSchemaParser)
def test_inspect_inferred(self) -> None:
    # arrange
    samples = DataLoader.load_samples()

    # act
    schema_definition = SchemaDefinition.create(DataLoader.load_schema())
    result = self.inspector.inspect(samples, schema_definition)

    # assert
    self.assertEqual(1.0, result.attribute_integrity)
    self.assertEqual(.0, result.attribute_specification)
    self.assertEqual(.5, result.attribute_quality_index)
def convert_expectations(self, schema_obj: SchemaDefinition) -> SchemaDefinition:
    """Convert the supported schema specifications into the Great Expectations format."""
    if schema_obj.is_empty():
        return schema_obj

    content = schema_obj.get_content()
    if self.get_property_keyword() not in content:
        return schema_obj

    self.convert_expectations_recursive(content)
    return SchemaDefinition.create(json.dumps(content),
                                   schema_obj.is_schema_inferred)
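# Example of the mapping performed above, taken from the expected schemas in the
# parser tests: a source field such as
#
#   {"name": "random_integer", "type": "integer", "minimum": 0, "maximum": 3}
#
# is extended with a Great Expectations entry
#
#   "expectations": [
#       {
#           "kwargs": {"min_value": 0, "max_value": 3},
#           "expectation_type": "expect_column_values_to_be_between"
#       }
#   ]
#
# while a "pattern" keyword maps to "expect_column_values_to_match_regex" with a
# "regex" kwarg.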
def test_inspect_with_inferred_schemas(self):
    # arrange
    schema = DataLoader.load_schema_with_name("schema_registry_json.json")
    schema_definition = SchemaDefinition.create(schema, True)
    samples = DataLoader.load_samples()

    # act
    result = self.inspector.inspect(samples, schema_definition)

    # assert
    self.assertEqual(1.0, result.attribute_integrity)
    self.assertEqual(.0, result.attribute_specification)
    self.assertEqual(.5, result.attribute_quality_index)
def test_convert_with_nested_expectations(self):
    # arrange
    schema = DataLoader.load_schema_with_name(
        "schema_nested_expectation_json.json")
    expected_schema = DataLoader.load_schema_with_name(
        "schema_nested_expectation_result_json.json")

    # act
    result = self.parser.convert_expectations(
        SchemaDefinition.create(schema, False))

    # assert
    self.assertStingEqualAsDict(result.schema_content, expected_schema)
def test_various_types_do_not_throw_exceptions(self):
    # arrange
    schema = '''
    {
        "type": "record",
        "name": "RandomData",
        "namespace": "data.producer.random",
        "fields": [
            { "name": "random_string", "type": "string" },
            { "name": "random_integer", "type": "int" },
            { "name": "random_float", "type": "float" },
            { "name": "random_boolean", "type": "boolean" }
        ]
    }
    '''
    samples = [{
        'random_string': 'wheyuugkwi',
        'random_integer': 876,
        'random_float': 0.2295482,
        'random_boolean': False
    }]

    # act
    metrics = self.inspector.inspect(
        samples, SchemaDefinition.create(schema, False))

    # assert
    self.assertIsNotNone(metrics)
def test_integrity_for_complex_type(self):
    # arrange
    schema = DataLoader.load_schema_with_name("schema_registry_avro.json")
    samples = [
        {
            "timestamp": 1595601702,
            "iss_position": {
                "longitude": "-42.2948",
                "latitude": "-40.3670"
            },
            "message": "success"
        },
        {
            "timestamp": 1595601702,
            "iss_position": {
                "latitude": "-40.3670"
            },
            "message": "success"
        },
        {
            "timestamp": "wrong",
            "iss_position": {
                "longitude": 666,
                "latitude": "-40.0283"
            },
            "message": "success"
        },
    ]

    # act
    result = self.inspector.inspect_attributes(
        samples, SchemaDefinition.create(schema, False))

    # assert - only "message" is optional, so 3 of the 12 attribute values
    # (3 samples * 4 attributes) are missing or invalid
    invalid_elements = 3
    all_elements = 12
    expected_integrity = (all_elements - invalid_elements) / all_elements
    self.assertAlmostEqual(
        expected_integrity, result.attribute_integrity, 3,
        f"Integrity must be {expected_integrity * 100}%")
def test_specification_from_toeggelomat(self):
    # arrange
    samples = DataLoader.load_samples_from_file("samples_toeggelomat.json")

    # act
    schema = DataLoader.load_schema_with_name("schema_toeggelomat.json")
    result = self.inspector.inspect(samples,
                                    SchemaDefinition.create(schema, False))

    # assert
    self.assertEqual(53, len(result.attribute_details.keys()),
                     "There should be 53 keys in the dictionary")
    for attribute_metric in result.attribute_details.keys():
        self.assertEqual(
            1.0,
            result.attribute_details[attribute_metric].attribute_specification,
            f"Attribute specification must be 100% ({attribute_metric})")
        self.assertEqual(
            1.0,
            result.attribute_details[attribute_metric].attribute_integrity,
            f"Attribute integrity must be 100% ({attribute_metric})")
def test_convert(self):
    # arrange
    schema = '''
    {
        "properties": {
            "random_string": {
                "type": "string",
                "minimum": 0,
                "maximum": 10
            }
        }
    }
    '''
    expected_schema = '''
    {
        "properties": {
            "random_string": {
                "type": "string",
                "minimum": 0,
                "maximum": 10,
                "expectations": [
                    {
                        "kwargs": {
                            "min_value": 0,
                            "max_value": 10
                        },
                        "expectation_type": "expect_column_values_to_be_between"
                    }
                ]
            }
        }
    }
    '''

    # act
    result = self.parser.convert_expectations(
        SchemaDefinition.create(schema, False))

    # assert
    self.assertStingEqualAsDict(result.schema_content, expected_schema)
def InspectQuality(self, request_iterator, context: typing.Any) -> QualityMetrics:
    all_samples = list()
    schema = None
    first_samples = None
    is_schema_inferred = True

    # collect every streamed batch; the inference flag is taken from the first batch only
    for batch in request_iterator:
        samples = json.loads(batch.samples_json)
        all_samples.append(samples)
        schema = batch.schema_json
        if not first_samples:
            is_schema_inferred = batch.is_schema_inferred
            first_samples = samples

    message = profiler_pb2.InspectionDataStreamResponse()
    inspector = QualityInspector()
    schema_definition = SchemaDefinition.create(schema, is_schema_inferred)
    try:
        # run the inspection and map the result onto the response message
        result = inspector.inspect(all_samples, schema_definition)
        message.metric.attribute_quality_index = result.attribute_quality_index
        message.metric.attribute_integrity = result.attribute_integrity
        message.metric.attribute_specification = result.attribute_specification
        for k, v in result.attribute_details.items():
            message.metric.attribute_details.append(
                domain_pb2.AttributeDetail(
                    name=k,
                    integrity=v.attribute_integrity,
                    specification=v.attribute_specification,
                    quality_index=v.attribute_quality_index))
    except Exception as e:
        logging.error(
            f'Exception in inspection of quality (samples: {first_samples}, schema: {schema}, exception: {e})'
        )
        message.error.message = repr(e)
        message.error.type = domain_pb2.InspectionError.Type.Value('UNKNOWN')
    return message
def test_convert_with_min_expectation(self):
    # arrange
    schema = '''
    {
        "fields": [
            {
                "name": "random_integer",
                "type": "integer",
                "minimum": 0
            }
        ]
    }
    '''
    expected_schema = '''
    {
        "fields": [
            {
                "name": "random_integer",
                "type": "integer",
                "minimum": 0,
                "expectations": [
                    {
                        "kwargs": {
                            "min_value": 0
                        },
                        "expectation_type": "expect_column_values_to_be_between"
                    }
                ]
            }
        ]
    }
    '''

    # act
    result = self.parser.convert_expectations(
        SchemaDefinition.create(schema, False))

    # assert
    self.assertStingEqualAsDict(result.schema_content, expected_schema)
def test_specification_with_partial_schema_and_inferred(self) -> None:
    # arrange
    samples = [{
        "random_int": 1,
        "random_string": "foo"
    }, {
        "random_int": 2,
        "random_string": "bar"
    }]
    schema = '''
    {
        "$schema": "http://json-schema.org/schema#",
        "type": "object",
        "properties": {
            "random_string": {
                "type": "string"
            },
            "random_integer": {
                "type": "integer"
            }
        },
        "required": [
            "random_integer",
            "random_string"
        ]
    }
    '''
    schema_definition = SchemaDefinition.create(schema, True)

    # act
    result = self.inspector.inspect(samples, schema_definition)

    # assert
    self.assertEqual(
        .0, result.attribute_specification,
        "Attribute specification is considered 0% when the schema is inferred")
def test_inspect_with_min_max_range_expectation(self):
    # arrange
    schema = DataLoader.load_schema_with_name("schema_with_min_max.json")
    samples = [
        { 'random_integer': 3 },
        { 'random_integer': 11 },
        { 'random_integer': 3 },
        { 'random_integer': 8 },
        { 'random_integer': 3 },
        { 'random_integer': -5 },
        { 'random_integer': 3 },
        { 'random_integer': 10 },
    ]

    # act
    metrics = self.inspector.inspect(
        samples, SchemaDefinition.create(schema, False))

    # assert - two of the eight values fall outside the configured min/max range
    self.assertEqual((6 / 8), metrics.attribute_integrity,
                     f"Attribute integrity must be {(6 / 8) * 100}%")
def test_integrity_on_attribute_level_with_expectations(self):
    # arrange
    schema = '''
    {
        "type": "record",
        "name": "RandomData",
        "namespace": "data.producer.random",
        "fields": [
            {
                "name": "random_integer",
                "type": "int",
                "expectations": [
                    {
                        "kwargs": {
                            "min_value": 0,
                            "max_value": 10
                        },
                        "expectation_type": "expect_column_values_to_be_between"
                    }
                ]
            },
            {
                "name": "random_string",
                "type": "string",
                "expectations": [
                    {
                        "kwargs": {
                            "regex": "id_"
                        },
                        "meta": {},
                        "expectation_type": "expect_column_values_to_match_regex"
                    }
                ]
            }
        ]
    }
    '''
    samples = [
        { 'random_integer': 1, 'random_string': 'missing_id' },
        { 'random_integer': 11, 'random_string': 'id_1' },
        { 'random_integer': 3, 'random_string': 'missing_id' },
    ]

    # act
    result = self.inspector.inspect(samples,
                                    SchemaDefinition.create(schema, False))

    # assert
    attribute_details = result.attribute_details
    self.assertAlmostEqual((3 / 6), result.attribute_integrity, 3,
                           "Attribute integrity is not correct")
    self.assertTrue('random_integer' in attribute_details.keys(),
                    "Missing integrity for attribute random_integer")
    self.assertTrue('random_string' in attribute_details.keys(),
                    "Missing integrity for attribute random_string")
    self.assertAlmostEqual(
        (2 / 3), attribute_details['random_integer'].attribute_integrity, 3,
        "Integrity of random_integer is not correct")
    self.assertAlmostEqual(
        (1 / 3), attribute_details['random_string'].attribute_integrity, 3,
        "Integrity of random_string is not correct")