Пример #1
0
    def create(cls, schema_definition: SchemaDefinition) -> AbstractSchemaParser:
        """
        Return the parser that matches the format of the given schema definition.

        :param schema_definition: definition whose ``is_avro()`` / ``is_json()``
            checks select the parser
        :return: an ``AvroSchemaParser`` or a ``JsonSchemaParser`` instance
        :raises AssertionError: if the definition is neither Avro nor JSON
        """
        if schema_definition.is_avro():
            return AvroSchemaParser()
        if schema_definition.is_json():
            return JsonSchemaParser()

        # Raise explicitly instead of ``assert False``: assert statements are
        # removed under ``python -O``, which would make this method silently
        # return None. The exception type and message stay the same.
        raise AssertionError("Parser not supported")
Пример #2
0
    def test_equality_true(self):
        """Two separately created empty schema definitions compare equal."""
        # arrange
        first = SchemaDefinition.empty()
        second = SchemaDefinition.empty()

        # act / assert
        self.assertEqual(first, second)
Пример #3
0
    def test_equality_inferred(self):
        """Definitions that differ only in ``is_schema_inferred`` are not equal."""
        # arrange
        baseline = SchemaDefinition.empty()
        inferred = SchemaDefinition.empty()
        inferred.is_schema_inferred = True

        # act / assert
        self.assertNotEqual(baseline, inferred)
Пример #4
0
    def test_equality_content(self):
        """Definitions that differ only in ``schema_content`` are not equal."""
        # arrange
        baseline = SchemaDefinition.empty()
        modified = SchemaDefinition.empty()
        modified.schema_content = "another schema content"

        # act / assert
        self.assertNotEqual(baseline, modified)
Пример #5
0
    def convert_expectations(self, schema_obj: SchemaDefinition) -> SchemaDefinition:
        """
        Convert the supported specifications to great expectation format.

        The given definition is returned untouched when it is empty or when its
        content does not contain the parser's property keyword. Otherwise a new
        definition is created from the converted content, keeping the original
        ``is_schema_inferred`` flag.
        """
        # Nothing to convert for an empty definition.
        if schema_obj.is_empty():
            return schema_obj

        parsed = schema_obj.get_content()
        has_property_keyword = self.get_property_keyword() in parsed
        if not has_property_keyword:
            return schema_obj

        # The return value is not used, so the helper presumably rewrites
        # ``parsed`` in place before it is serialized again.
        self.convert_expectations_recursive(parsed)
        serialized = json.dumps(parsed)
        return SchemaDefinition.create(serialized, schema_obj.is_schema_inferred)
Пример #6
0
    def test_inspect_with_both_schema_formats(self):
        """Inspecting the same samples against the JSON and the Avro schema file gives equal results."""
        # arrange
        json_content = DataLoader.load_schema_with_name("schema_diff_json.json")
        avro_content = DataLoader.load_schema_with_name("schema_diff_avro.json")

        samples = DataLoader.load_samples()

        # act
        json_definition = SchemaDefinition.create(json_content, False)
        json_result = self.inspector.inspect(samples, json_definition)
        avro_definition = SchemaDefinition.create(avro_content, False)
        avro_result = self.inspector.inspect(samples, avro_definition)

        # assert
        self.assertEqual(json_result, avro_result)
Пример #7
0
    def test_inspect_with_multiple_expectations_asyncapi_style_json(self):
        """The AsyncAPI-style expectation schema yields an attribute integrity of 6/10 for the samples below."""
        # arrange
        schema_content = DataLoader.load_schema_with_name(
            "schema_expectation_asyncapi_style_json.json")
        definition = SchemaDefinition.create(schema_content, False)

        samples = [
            {'random_integer': 1, 'random_string': 'id_1'},
            {'random_integer': 2, 'random_string': 'foo'},  # no match (string)
            {'random_integer': 3, 'random_string': 'id_3'},
            {'random_integer': 4, 'random_string': 'id_4'},  # no match (integer)
            {'random_integer': 5, 'random_string': 'foo'},  # no match (integer, string)
        ]

        # act
        result = self.inspector.inspect(samples, definition)

        # assert
        expected_integrity = 6 / 10
        self.assertAlmostEqual(expected_integrity, result.attribute_integrity, 3)
Пример #8
0
    def expand_schema(cls,
                      types: list,
                      required_types: list = None,
                      expectations: dict = None) -> SchemaDefinition:
        """
        Build a record schema definition from a list of fields.

        :param types: iterable of ``(name, type)`` pairs, one per field
        :param required_types: names of fields whose type is emitted as-is;
            every other field is emitted as a ``[type, "null"]`` union
        :param expectations: optional mapping of field name to a dict of extra
            key/value pairs that are added to that field's entry
        :return: the definition created from the rendered schema, flagged as
            not inferred
        """
        # Function-scope import so the helper does not rely on a module-level one.
        import json

        if required_types is None:
            required_types = []
        if expectations is None:
            expectations = {}

        fields = []
        for name, t in types:
            type_description = f'"{t}"' if name in required_types else f'["{t}", "null"]'
            field = '{"name": "' + name + '","type": ' + type_description
            for key, value in expectations.get(name, {}).items():
                # json.dumps renders every value as valid JSON: booleans become
                # true/false, None becomes null, nested lists/dicts use double
                # quotes, and quotes/backslashes inside strings are escaped.
                # Plain string interpolation produced invalid JSON for those.
                field += f', "{key}": {json.dumps(value, ensure_ascii=False)}'
            fields.append(field + '}')
        properties = ','.join(fields)

        schema = r'''
                   {
                    "type": "record",
                    "name": "RandomMessage",
                    "namespace": "data.producer.random",
                    "fields": [
                        #PROPERTIES#
                    ]
                }
                   '''

        schema_content = schema.replace("#PROPERTIES#", properties)

        return SchemaDefinition.create(schema_content, False)
Пример #9
0
    def test_convert_with_unsupported_property(self):
        """A field whose only extra property is ``foobar`` is expected to come back unchanged."""
        # arrange
        input_schema = '''
                       {
                           "fields": [
                                {
                                    "name": "random_string",
                                    "type": "string",
                                    "foobar": "id_"
                                }
                           ]
                       }
                       '''
        unchanged_schema = '''
                        {
                           "fields": [
                                {
                                    "name": "random_string",
                                    "type": "string",
                                    "foobar": "id_"
                                }
                           ]
                       }
                       '''
        definition = SchemaDefinition.create(input_schema, False)

        # act
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, unchanged_schema)
Пример #10
0
    def test_convert_with_multiple_expectations(self):
        """``minimum``/``maximum`` and ``pattern`` on two fields each gain the expected ``expectations`` entry."""
        # arrange
        input_schema = '''
        {
            "fields": [
                {
                    "name": "random_integer",
                    "type": "integer",
                    "minimum": 0,
                    "maximum": 3
                },
                {
                    "name": "random_string",
                    "type": "string",
                    "pattern": "id_"
                }
            ]
        }
        '''
        converted_schema = '''
        {
           "fields": [
               {
                   "name": "random_integer",
                   "type": "integer",
                   "minimum": 0,
                   "maximum": 3,
                   "expectations": [
                   {
                          "kwargs": {
                           "min_value": 0,
                           "max_value": 3
                         },
                         "expectation_type": "expect_column_values_to_be_between"
                       }
                   ]
               },
                {
                   "name": "random_string",
                   "type": "string",
                   "pattern": "id_",
                   "expectations": [
                   {
                         "kwargs": {
                           "regex": "id_"
                         },
                         "expectation_type": "expect_column_values_to_match_regex"
                       }
                   ]
               }
           ]
        }
        '''
        definition = SchemaDefinition.create(input_schema, False)

        # act
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, converted_schema)
    def test_create_avro_parser(self):
        """The factory returns an ``AvroSchemaParser`` for the schema from ``DataLoader.load_schema()``."""
        # arrange
        schema_content = DataLoader.load_schema()
        schema_definition = SchemaDefinition.create(schema_content, False)

        # act
        created = SchemaParserFactory.create(schema_definition)

        # assert
        self.assertIsInstance(created, AvroSchemaParser)
Пример #12
0
    def test_integrity_without_provided_schema(self) -> None:
        """With an empty schema, integrity is 1.0, specification 0.0 and the quality index 0.5."""
        # arrange
        dummy_samples, _ = DataLoader.create_dummy_samples()

        # act
        no_schema = SchemaDefinition.empty()
        metrics = self.inspector.inspect(dummy_samples, no_schema)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Пример #13
0
    def test_inspect_inferred(self) -> None:
        """A definition created without an explicit inferred flag scores 1.0 / 0.0 / 0.5."""
        # arrange
        loaded_samples = DataLoader.load_samples()

        # act
        # NOTE(review): only the content is passed here; presumably ``create``
        # defaults to an inferred schema - confirm against SchemaDefinition.
        schema_content = DataLoader.load_schema()
        definition = SchemaDefinition.create(schema_content)
        metrics = self.inspector.inspect(loaded_samples, definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Пример #14
0
    def test_inspect_with_inferred_schemas(self):
        """A schema flagged as inferred scores integrity 1.0, specification 0.0 and quality index 0.5."""
        # arrange
        schema_content = DataLoader.load_schema_with_name("schema_registry_json.json")
        inferred_definition = SchemaDefinition.create(schema_content, True)
        loaded_samples = DataLoader.load_samples()

        # act
        metrics = self.inspector.inspect(loaded_samples, inferred_definition)

        # assert
        self.assertEqual(1.0, metrics.attribute_integrity)
        self.assertEqual(0.0, metrics.attribute_specification)
        self.assertEqual(0.5, metrics.attribute_quality_index)
Пример #15
0
    def test_convert_with_nested_expectations(self):
        """Converting the nested-expectation schema file yields the content of its ``_result`` counterpart."""
        # arrange
        input_schema = DataLoader.load_schema_with_name(
            "schema_nested_expectation_json.json")
        converted_schema = DataLoader.load_schema_with_name(
            "schema_nested_expectation_result_json.json")
        definition = SchemaDefinition.create(input_schema, False)

        # act
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, converted_schema)
Пример #16
0
    def parse_schema(self, schema_definition: SchemaDefinition) -> SchemaParserResult:
        """
        Template method: preprocess the definition, then collect its types.

        The definition is first passed through ``convert_expectations``; the
        resulting content is handed to ``load_types_from_schema`` and
        ``load_required_types_from_schema``, and their four results are bundled
        into a ``SchemaParserResult``.
        """
        # preprocessing step
        converted = self.convert_expectations(schema_definition)
        content = converted.get_content()

        # types and their expectations
        types, expectations = self.load_types_from_schema(content)

        # required types and their expectations
        required, required_expectations = self.load_required_types_from_schema(content)

        return SchemaParserResult(types, expectations, required, required_expectations)
Пример #17
0
    def test_integrity_for_complex_type(self):
        """Nested ``iss_position`` samples with one missing and two wrongly typed values score 9/12 integrity."""
        # arrange
        schema_content = DataLoader.load_schema_with_name("schema_registry_avro.json")
        definition = SchemaDefinition.create(schema_content, False)

        complete_sample = {
            "timestamp": 1595601702,
            "iss_position": {
                "longitude": "-42.2948",
                "latitude": "-40.3670"
            },
            "message": "success"
        }
        sample_without_longitude = {
            "timestamp": 1595601702,
            "iss_position": {
                "latitude": "-40.3670"
            },
            "message": "success"
        }
        sample_with_wrong_types = {
            "timestamp": "wrong",
            "iss_position": {
                "longitude": 666,
                "latitude": "-40.0283"
            },
            "message": "success"
        }
        samples = [complete_sample, sample_without_longitude, sample_with_wrong_types]

        # act
        result = self.inspector.inspect_attributes(samples, definition)

        # assert - only message is not mandatory so 3 out of 12 (3*4) are missing or wrong
        all_elements = 12
        invalid_elements = 3
        expected_integrity = (all_elements - invalid_elements) / all_elements
        failure_message = f"Integrity must be {expected_integrity * 100}%"
        self.assertAlmostEqual(
            expected_integrity, result.attribute_integrity, 3, failure_message)
Пример #18
0
    def test_various_types_do_not_throw_exceptions(self):
        """Inspecting a sample with string, int, float and boolean fields returns a result."""
        # arrange
        record_schema = '''
        {
            "type": "record",
            "name": "RandomData",
            "namespace": "data.producer.random",
            "fields": [
                {
                    "name": "random_string",
                    "type": "string"
                },
                {
                    "name": "random_integer",
                    "type": "int"
                },
                {
                    "name": "random_float",
                    "type": "float"
                },
                {
                    "name": "random_boolean",
                    "type": "boolean"
                }
            ]
        }
        '''

        single_sample = {
            'random_string': 'wheyuugkwi',
            'random_integer': 876,
            'random_float': 0.2295482,
            'random_boolean': False
        }
        definition = SchemaDefinition.create(record_schema, False)

        # act
        result = self.inspector.inspect([single_sample], definition)

        # assert
        self.assertIsNotNone(result)
Пример #19
0
    def test_specification_from_toeggelomat(self):
        """All 53 attributes of the toeggelomat samples reach 100% specification and integrity."""
        # arrange
        samples = DataLoader.load_samples_from_file("samples_toeggelomat.json")

        # act
        schema_content = DataLoader.load_schema_with_name("schema_toeggelomat.json")
        definition = SchemaDefinition.create(schema_content, False)
        result = self.inspector.inspect(samples, definition)

        # assert
        self.assertEqual(53, len(result.attribute_details),
                         "There should be 53 keys in the dictionary")
        for attribute_metric, detail in result.attribute_details.items():
            self.assertEqual(
                1.0, detail.attribute_specification,
                f"Attribute specification must be 100% ({attribute_metric})")
            self.assertEqual(
                1.0, detail.attribute_integrity,
                f"Attribute integrity must be 100% ({attribute_metric})")
Пример #20
0
    def test_convert(self):
        """``minimum``/``maximum`` on a property gain an ``expect_column_values_to_be_between`` expectation."""
        # arrange
        input_schema = '''
               {
                    "properties": {
                        "random_string": {
                            "type": "string",
                            "minimum": 0,
                            "maximum": 10
                        }
                    }
                }
               '''
        converted_schema = '''
               {
                    "properties": {
                        "random_string": {
                            "type": "string",
                            "minimum": 0,
                            "maximum": 10,
                          "expectations": [
                              {
                                 "kwargs": {
                                  "min_value": 0,
                                  "max_value": 10
                                },
                                "expectation_type": "expect_column_values_to_be_between"
                              }
                          ]
                      }
                  }
               }
               '''
        definition = SchemaDefinition.create(input_schema, False)

        # act
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, converted_schema)
Пример #21
0
    def InspectQuality(self, request_iterator,
                       context: typing.Any) -> QualityMetrics:
        """
        Consume all streamed batches, inspect their quality and build the response.

        Samples of every batch are collected; the schema of the last batch and
        the ``is_schema_inferred`` flag of the first batch with non-empty
        samples are used. Any exception raised by the inspection is logged and
        reported through the ``error`` field of the response instead of being
        propagated.

        NOTE(review): the object returned is a
        ``profiler_pb2.InspectionDataStreamResponse``, not the annotated
        ``QualityMetrics`` - confirm which one is intended.
        """
        all_samples = []
        schema = None
        first_samples = None
        is_schema_inferred = True
        for batch in request_iterator:
            batch_samples = json.loads(batch.samples_json)
            # NOTE(review): each decoded batch is appended as one element; if
            # ``samples_json`` holds a list this nests lists - confirm that
            # ``append`` rather than ``extend`` is intended.
            all_samples.append(batch_samples)
            schema = batch.schema_json
            if first_samples:
                continue
            is_schema_inferred = batch.is_schema_inferred
            first_samples = batch_samples

        message = profiler_pb2.InspectionDataStreamResponse()

        inspector = QualityInspector()
        schema_definition = SchemaDefinition.create(schema, is_schema_inferred)
        try:
            result = inspector.inspect(all_samples, schema_definition)
            metric = message.metric
            metric.attribute_quality_index = result.attribute_quality_index
            metric.attribute_integrity = result.attribute_integrity
            metric.attribute_specification = result.attribute_specification
            for name, detail in result.attribute_details.items():
                attribute_detail = domain_pb2.AttributeDetail(
                    name=name,
                    integrity=detail.attribute_integrity,
                    specification=detail.attribute_specification,
                    quality_index=detail.attribute_quality_index)
                metric.attribute_details.append(attribute_detail)
        except Exception as e:
            logging.error(
                f'Exception in inspection of quality (samples: {first_samples}, schema: {schema}, exception: {e})'
            )
            error = message.error
            error.message = repr(e)
            error.type = domain_pb2.InspectionError.Type.Value('UNKNOWN')

        return message
Пример #22
0
    def test_convert_with_min_expectation(self):
        """A lone ``minimum`` gains a between-expectation that carries only ``min_value``."""
        # arrange
        input_schema = '''
        {
            "fields": [
                {
                    "name": "random_integer",
                    "type": "integer",
                    "minimum": 0
                }
            ]
        }
        '''
        converted_schema = '''
        {
           "fields": [
               {
                   "name": "random_integer",
                   "type": "integer",
                   "minimum": 0,
                   "expectations": [
                       {
                          "kwargs": {
                           "min_value": 0
                         },
                         "expectation_type": "expect_column_values_to_be_between"
                       }
                   ]
               }
           ]
        }
        '''
        definition = SchemaDefinition.create(input_schema, False)

        # act
        converted = self.parser.convert_expectations(definition)

        # assert
        self.assertStingEqualAsDict(converted.schema_content, converted_schema)
Пример #23
0
    def test_specification_with_partial_schema_and_inferred(self) -> None:
        """An inferred schema scores 0% attribute specification."""
        # arrange
        first_sample = {"random_int": 1, "random_string": "foo"}
        second_sample = {"random_int": 2, "random_string": "bar"}
        samples = [first_sample, second_sample]

        json_schema = '''
               {
                   "$schema": "http://json-schema.org/schema#",
                   "type": "object",
                   "properties": {
                       "random_string": {
                           "type": "string"
                       },
                       "random_integer": {
                           "type": "integer"
                       }
                   },
                   "required": [
                       "random_integer",
                       "random_string"
                   ]
               }
               '''
        inferred_definition = SchemaDefinition.create(json_schema, True)

        # act
        metrics = self.inspector.inspect(samples, inferred_definition)

        # assert
        failure_message = "Attribute specification is considered 0% when the schema is inferred"
        self.assertEqual(0.0, metrics.attribute_specification, failure_message)
Пример #24
0
    def test_inspect_with_min_max_range_expectation(self):
        """Eight integer samples checked against the min/max schema file score 6/8 integrity."""
        # arrange
        schema_content = DataLoader.load_schema_with_name("schema_with_min_max.json")
        definition = SchemaDefinition.create(schema_content, False)

        values = (3, 11, 3, 8, 3, -5, 3, 10)
        samples = [{'random_integer': value} for value in values]

        # act
        result = self.inspector.inspect(samples, definition)

        # assert
        failure_message = f"Attribute integrity must be {(6 / 8) * 100}%"
        self.assertEqual((6 / 8), result.attribute_integrity, failure_message)
Пример #25
0
    def test_integrity_on_attribute_level_with_expectations(self):
        """Per-attribute integrity is reported for a range and a regex expectation (2/3 and 1/3, 3/6 overall)."""
        # arrange
        record_schema = '''
                {
                    "type": "record",
                    "name": "RandomData",
                    "namespace": "data.producer.random",
                    "fields": [
                        {
                            "name": "random_integer",
                            "type": "int",
                            "expectations": [
                                {       
                                   "kwargs": {
                                    "min_value": 0,
                                    "max_value": 10
                                  },
                                  "expectation_type": "expect_column_values_to_be_between"
                                }
                            ]
                        },
                         {
                            "name": "random_string",
                            "type": "string",
                            "expectations": [
                            {       
                                  "kwargs": {
                                    "regex": "id_"
                                  },
                                  "meta": {},
                                  "expectation_type": "expect_column_values_to_match_regex"
                                }
                            ]
                        }
                    ]
                }
                '''
        definition = SchemaDefinition.create(record_schema, False)

        samples = [
            {'random_integer': 1, 'random_string': 'missing_id'},
            {'random_integer': 11, 'random_string': 'id_1'},
            {'random_integer': 3, 'random_string': 'missing_id'},
        ]

        # act
        metrics = self.inspector.inspect(samples, definition)

        # assert
        details = metrics.attribute_details
        self.assertAlmostEqual((3 / 6), metrics.attribute_integrity, 3,
                               "Attribute integrity is not correct")
        self.assertTrue('random_integer' in details,
                        "Missing integrity for attribute random_integer")
        self.assertTrue('random_string' in details,
                        "Missing integrity for attribute random_string")

        integer_integrity = details['random_integer'].attribute_integrity
        self.assertAlmostEqual(
            (2 / 3), integer_integrity, 3,
            "Integrity of random_int is not correct")

        string_integrity = details['random_string'].attribute_integrity
        self.assertAlmostEqual(
            (1 / 3), string_integrity, 3,
            "Integrity of random_string is not correct")