Пример #1
0
def test_avro_schema_to_mce_fields_record_with_two_fields():
    """A record with two string fields yields one v2 field path per field."""
    two_field_record = """
{
  "type": "record",
  "name": "some.event.name",
  "namespace": "not.relevant.namespace",
  "fields": [
    {
      "name": "a",
      "type": "string",
      "doc": "some.doc"
    },
    {
      "name": "b",
      "type": "string",
      "doc": "some.doc"
    }
  ]
}
"""
    parsed_fields = avro_schema_to_mce_fields(two_field_record)
    # One expected path per declared field, in declaration order.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=name].[type=string].a",
            "[version=2.0].[type=name].[type=string].b",
        ],
    )
Пример #2
0
    def test_avro_schema_to_mce_fields_sample_events_with_different_field_types(self):
        """Each sample schema must be converted into exactly one field."""
        for sample_schema in (SCHEMA_WITH_MAP_TYPE_FIELD,):
            parsed_fields = avro_schema_to_mce_fields(sample_schema)
            self.assertEqual(1, len(parsed_fields))
Пример #3
0
def test_simple_record_with_primitive_types():
    """A flat record of string/boolean/int/enum fields yields one path per field."""
    simple_record = """
    {
        "type": "record",
        "name": "Simple",
        "namespace": "com.linkedin",
        "fields": [
            {"name": "stringField", "type": "string", "doc": "string field"},
            {"name": "booleanField", "type": "boolean" },
            {"name": "intField", "type": "int" },
            {
                "name": "enumField",
                "type": {
                    "type": "enum",
                    "name": "MyTestEnumField",
                    "symbols": [ "TEST", "TEST1" ],
                    "symbolDoc": {
                        "TEST": "test enum",
                        "TEST1": "test1 enum"
                    }
                }
            }
        ]
    }
"""
    parsed_fields = avro_schema_to_mce_fields(simple_record)
    # The enum field is expected under the generic "enum" type token.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=Simple].[type=string].stringField",
            "[version=2.0].[type=Simple].[type=boolean].booleanField",
            "[version=2.0].[type=Simple].[type=int].intField",
            "[version=2.0].[type=Simple].[type=enum].enumField",
        ],
    )
Пример #4
0
def test_avro_schema_to_mce_fields_with_nesting_across_records():
    """A top-level list of two records is expected to surface as a union of both."""
    union_of_records = """
[
    {
        "type": "record",
        "name": "Address",
        "fields": [
            {"name": "streetAddress", "type": "string"},
            {"name": "city", "type": "string"}
        ]
    },
    {
        "type": "record",
        "name": "Person",
        "fields": [
            {"name": "firstname", "type": "string"},
            {"name": "lastname", "type": "string" },
            {"name": "address", "type": "Address"}
        ]
    }
]
"""
    parsed_fields = avro_schema_to_mce_fields(union_of_records)
    # First entry is the union itself, followed by the members' fields.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=union]",
            "[version=2.0].[type=union].[type=Address].[type=string].streetAddress",
            "[version=2.0].[type=union].[type=Address].[type=string].city",
            "[version=2.0].[type=union].[type=Person].[type=string].firstname",
            "[version=2.0].[type=union].[type=Person].[type=string].lastname",
            "[version=2.0].[type=union].[type=Person].[type=Address].address",
        ],
    )
Пример #5
0
        def get_schema_fields_for_column(
            self, dataset_name: str, column: dict, pk_constraints: dict = None
        ) -> List[SchemaField]:
            """Return schema fields for a column, expanding ROW/ARRAY/MAP types.

            The parent implementation is called first. When the column type is
            not one of the complex types, its result is returned unchanged.
            Otherwise an avro schema is derived from the column type and turned
            into a list of fields; nullability, description and key membership
            of the parent's single field are copied onto the first new field.
            """
            base_fields = super().get_schema_fields_for_column(
                dataset_name, column, pk_constraints
            )

            complex_types = (datatype.ROW, sqltypes.ARRAY, datatype.MAP)
            if not isinstance(column["type"], complex_types):
                return base_fields

            assert len(base_fields) == 1
            parent = base_fields[0]
            # Avro schema covers the parent complex field and its subfields.
            nested_schema = self.get_avro_schema_from_data_type(
                column["type"], column["name"]
            )
            expanded = schema_util.avro_schema_to_mce_fields(
                json.dumps(nested_schema), default_nullable=True
            )

            # The first expanded entry stands for the parent complex field.
            head = expanded[0]
            head.nullable = parent.nullable
            head.description = parent.description
            head.isPartOfKey = parent.isPartOfKey
            return expanded
Пример #6
0
def test_simple_nested_record_with_a_string_field_for_key_schema():
    """A nested record parsed as a key schema carries the key marker in every path."""
    nested_record = """
    {
        "type": "record",
        "name": "SimpleNested",
        "namespace": "com.linkedin",
        "fields": [{
            "name": "nestedRcd",
            "type": {
                "type": "record",
                "name": "InnerRcd",
                "fields": [{
                    "name": "aStringField",
                     "type": "string"
                } ]
            }
        }]
    }
"""
    # Second positional argument is passed as True, as in the key-schema case.
    parsed_fields = avro_schema_to_mce_fields(nested_record, True)
    expected: List[str] = [
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd",
        "[version=2.0].[key=True].[type=SimpleNested].[type=InnerRcd].nestedRcd.[type=string].aStringField",
    ]
    assert_field_paths_match(parsed_fields, expected)
Пример #7
0
def test_nested_arrays():
    """An array of arrays of nullable records yields paths with both array tokens."""
    array_of_arrays = """
{
    "type": "record",
    "name": "NestedArray",
    "namespace": "com.linkedin",
    "fields": [{
        "name": "ar",
        "type": {
            "type": "array",
            "items": {
                "type": "array",
                "items": [
                    "null",
                    {
                        "type": "record",
                        "name": "Foo",
                        "fields": [ {
                            "name": "a",
                            "type": "long"
                        } ]
                    }
                ]
            }
        }
    } ]
}
"""
    parsed_fields = avro_schema_to_mce_fields(array_of_arrays)
    expected: List[str] = [
        "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar",
        "[version=2.0].[type=NestedArray].[type=array].[type=array].[type=Foo].ar.[type=long].a",
    ]
    assert_field_paths_match(parsed_fields, expected)
Пример #8
0
def test_logical_types():
    """Fields annotated with logical types keep their underlying type in the path."""
    logical_schema: str = """
{
    "type": "record",
    "name": "test_logical_types",
    "fields":  [
        {"name": "decimal_logical", "type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2},
        {"name": "uuid_logical", "type": "string", "logicalType": "uuid"},
        {"name": "date_logical", "type": "int", "logicalType": "date"},
        {"name": "time_millis_logical", "type": "int", "logicalType": "time-millis"},
        {"name": "time_micros_logical", "type": "long", "logicalType": "time-micros"},
        {"name": "timestamp_millis_logical", "type": "long", "logicalType": "timestamp-millis"},
        {"name": "timestamp_micros_logical", "type": "long", "logicalType": "timestamp-micros"}
    ]
}
    """
    parsed_fields: List[SchemaField] = avro_schema_to_mce_fields(
        logical_schema, is_key_schema=False
    )
    # Paths use bytes/string/int/long rather than the logical type names.
    expected: List[str] = [
        "[version=2.0].[type=test_logical_types].[type=bytes].decimal_logical",
        "[version=2.0].[type=test_logical_types].[type=string].uuid_logical",
        "[version=2.0].[type=test_logical_types].[type=int].date_logical",
        "[version=2.0].[type=test_logical_types].[type=int].time_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].time_micros_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_micros_logical",
    ]
    assert_field_paths_match(parsed_fields, expected)
Пример #9
0
    def test_avro_schema_to_mce_fields_toplevel_isnt_a_record(self):
        """A schema whose top level is a primitive must produce exactly one field."""
        for sample_schema in (SCHEMA_WITH_TOP_LEVEL_PRIMITIVE_FIELD,):
            parsed_fields = avro_schema_to_mce_fields(sample_schema)
            self.assertEqual(1, len(parsed_fields))
Пример #10
0
def test_map_of_union_of_int_and_record_of_union():
    """A map whose values are a union of int and a record (itself holding a union)."""
    map_schema = """
    {
        "type": "record",
        "name": "MapSample",
        "namespace": "com.linkedin",
        "fields": [{
            "name": "aMap",
            "type": {
                "type": "map",
                "values": [
                    "int",
                    {
                        "type": "record",
                        "name": "Rcd",
                        "fields": [{
                            "name": "aUnion",
                            "type": ["null", "string", "int"]
                        }]
                    }
               ]
            }
        }]
    }
"""
    parsed_fields = avro_schema_to_mce_fields(map_schema)
    # Expected: the map field, one entry per union member, then the nested
    # union field and its non-null members.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=MapSample].[type=map].[type=union].aMap",
            "[version=2.0].[type=MapSample].[type=map].[type=union].[type=int].aMap",
            "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap",
            "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].aUnion",
            "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=string].aUnion",
            "[version=2.0].[type=MapSample].[type=map].[type=union].[type=Rcd].aMap.[type=union].[type=int].aUnion",
        ],
    )
Пример #11
0
    def test_avro_schema_to_mce_fields_record_with_two_fields(self):
        """A record schema with two fields must be converted into two fields."""
        for sample_schema in (SCHEMA_WITH_TWO_FIELD_RECORD,):
            parsed_fields = avro_schema_to_mce_fields(sample_schema)
            self.assertEqual(2, len(parsed_fields))
Пример #12
0
def test_recursive_avro():
    """A record that references itself is listed once, without infinite expansion."""
    recursive_schema = """
    {
        "type": "record",
        "name": "Recursive",
        "namespace": "com.linkedin",
        "fields": [{
            "name": "r",
            "type": {
                "type": "record",
                "name": "R",
                "fields": [
                    { "name" : "anIntegerField", "type" : "int" },
                    { "name": "aRecursiveField", "type": "com.linkedin.R"}
                ]
            }
        }]
    }
"""
    parsed_fields = avro_schema_to_mce_fields(recursive_schema)
    # The self-referencing field appears as a leaf of type R.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=Recursive].[type=R].r",
            "[version=2.0].[type=Recursive].[type=R].r.[type=int].anIntegerField",
            "[version=2.0].[type=Recursive].[type=R].r.[type=R].aRecursiveField",
        ],
    )
Пример #13
0
 def _get_schema_fields(self, topic: str, schema: Schema,
                        is_key_schema: bool) -> List[SchemaField]:
     """Convert a registry schema into SchemaFields.

     AVRO schemas are converted via ``schema_util`` and PROTOBUF schemas via
     ``protobuf_util``. For any other schema type a warning is reported for
     the topic and an empty list is returned.
     """
     if schema.schema_type == "AVRO":
         avro_text: str = self.get_schema_str_replace_confluent_ref_avro(
             schema)
         # "value.id" or "value.[type=string]id"
         return schema_util.avro_schema_to_mce_fields(
             avro_text, is_key_schema=is_key_schema)

     if schema.schema_type == "PROTOBUF":
         referenced: List[
             ProtobufSchema] = self.get_schemas_from_confluent_ref_protobuf(
                 schema)
         # Dots in the topic are replaced by underscores for the proto name.
         stem: str = topic.replace(".", "_")
         if is_key_schema:
             proto_name = f"{stem}-key.proto"
         else:
             proto_name = f"{stem}-value.proto"
         return protobuf_util.protobuf_schema_to_mce_fields(
             ProtobufSchema(proto_name, schema.schema_str),
             referenced,
             is_key_schema=is_key_schema,
         )

     self.report.report_warning(
         topic,
         f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
     )
     return []
Пример #14
0
    def get_schema_fields_for_column(
        self,
        dataset_name: str,
        column: Dict[Any, Any],
        pk_constraints: Optional[Dict[Any, Any]] = None,
    ) -> List[SchemaField]:
        """Return schema fields for a column, expanding complex native types.

        The parent implementation is called first. If the first field's native
        data type matches ``_COMPLEX_TYPE`` and its type resolved to
        ``NullTypeClass``, an avro schema is derived from the native type and
        converted into fields; nullability, description and key membership of
        the original field are copied onto the first new field. Otherwise the
        parent's result is returned unchanged.
        """
        base_fields = super().get_schema_fields_for_column(
            dataset_name, column, pk_constraints)

        needs_expansion = self._COMPLEX_TYPE.match(
            base_fields[0].nativeDataType) and isinstance(
                base_fields[0].type.type, NullTypeClass)
        if not needs_expansion:
            return base_fields

        assert len(base_fields) == 1
        parent = base_fields[0]
        # Avro schema covers the parent complex field and its subfields.
        nested_schema = self.get_avro_schema_from_native_data_type(
            parent.nativeDataType, column["name"])
        expanded = schema_util.avro_schema_to_mce_fields(
            json.dumps(nested_schema), default_nullable=True)

        # The first expanded entry stands for the parent complex field.
        head = expanded[0]
        head.nullable = parent.nullable
        head.description = parent.description
        head.isPartOfKey = parent.isPartOfKey
        return expanded
Пример #15
0
def test_avro_schema_to_mce_fields_with_default():
    """The single field's description must mention the custom default value."""
    parsed_fields = avro_schema_to_mce_fields(SCHEMA_WITH_DEFAULT_VALUE)
    assert len(parsed_fields) == 1

    only_field = parsed_fields[0]
    # Description must be non-empty before the substring check is meaningful.
    assert (only_field.description
            and "custom, default value" in only_field.description)
Пример #16
0
def test_avro_schema_to_mce_fields_toplevel_isnt_a_record():
    """A bare top-level string schema yields a single path with no field name."""
    primitive_schema = """
{
  "type": "string"
}
"""
    parsed_fields = avro_schema_to_mce_fields(primitive_schema)
    assert_field_paths_match(parsed_fields, ["[version=2.0].[type=string]"])
Пример #17
0
def test_ignore_exceptions():
    """A malformed schema string must produce no fields."""
    # Not a valid JSON document: the enclosing braces are missing.
    broken_schema: str = """
  "name": "event_ts",
  "type": "long",
  "logicalType": "timestamp-millis",
  "tags": [
    "business-timestamp"
  ]
"""
    parsed_fields: List[SchemaField] = avro_schema_to_mce_fields(broken_schema)
    assert not parsed_fields
Пример #18
0
    def test_avro_schema_to_mce_fields_events_with_nullable_fields(self):
        """Each sample schema must yield exactly one field, and it must be nullable."""
        nullable_samples = (
            SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE,
            SCHEMA_WITH_OPTIONAL_FIELD_VIA_UNION_TYPE_NULL_ISNT_FIRST_IN_UNION,
            SCHEMA_WITH_OPTIONAL_FIELD_VIA_PRIMITIVE_TYPE,
        )

        for sample_schema in nullable_samples:
            parsed_fields = avro_schema_to_mce_fields(sample_schema)
            self.assertEqual(1, len(parsed_fields))
            self.assertTrue(parsed_fields[0].nullable)
Пример #19
0
    def test_avro_schema_to_mce_fields_events_with_nullable_fields(self):
        """Each example event must yield exactly one field, and it must be nullable."""
        nullable_events = (
            EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_UNION_TYPE,
            EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_UNION_TYPE_NULL_ISNT_FIRST_IN_UNION,
            EXAMPLE_EVENT_OPTIONAL_FIELD_VIA_PRIMITIVE_TYPE,
        )

        for sample_event in nullable_events:
            parsed_fields = avro_schema_to_mce_fields(sample_event)
            self.assertEqual(1, len(parsed_fields))
            self.assertTrue(parsed_fields[0].nullable)
Пример #20
0
def test_avro_sample_payment_schema_to_mce_fields_with_nesting():
    """A Payment record with a nullable nested PhoneNumber record is fully expanded."""
    payment_schema = """
{
  "type": "record",
  "name": "Payment",
  "namespace": "some.event.namespace",
  "fields": [
    {"name": "id", "type": "string"},
    {"name": "amount", "type": "double"},
    {"name": "name","type": "string","default": ""},
    {"name": "phoneNumber",
     "type": [{
         "type": "record",
         "name": "PhoneNumber",
         "fields": [{
             "name": "areaCode",
             "type": "string",
             "default": ""
             }, {
             "name": "countryCode",
             "type": "string",
             "default": ""
             }, {
             "name": "prefix",
             "type": "string",
             "default": ""
             }, {
             "name": "number",
             "type": "string",
             "default": ""
             }]
         },
         "null"
     ],
     "default": "null"
    }
  ]
}
"""
    parsed_fields = avro_schema_to_mce_fields(payment_schema)
    # Three top-level primitives, then the nested record and its four members.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=Payment].[type=string].id",
            "[version=2.0].[type=Payment].[type=double].amount",
            "[version=2.0].[type=Payment].[type=string].name",
            "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber",
            "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].areaCode",
            "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].countryCode",
            "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].prefix",
            "[version=2.0].[type=Payment].[type=PhoneNumber].phoneNumber.[type=string].number",
        ],
    )
Пример #21
0
    def _extract_record(self, topic: str) -> MetadataChangeEvent:
        """Build a MetadataChangeEvent describing the given Kafka topic.

        The snapshot always carries a ``Status(removed=False)`` aspect. The
        latest ``<topic>-value`` schema is looked up in the schema registry;
        if that lookup fails, a warning is reported and no schema aspect is
        added. AVRO schemas are parsed into fields; other schema types are
        reported as a warning and recorded with an empty field list.
        """
        logger.debug(f"topic = {topic}")
        platform = "kafka"
        dataset_name = topic
        actor = "urn:li:corpuser:etl"
        sys_time = get_sys_time()

        snapshot = DatasetSnapshot(
            urn=f"urn:li:dataset:(urn:li:dataPlatform:{platform},{dataset_name},{self.source_config.env})",
            aspects=[],  # filled in below
        )
        snapshot.aspects.append(Status(removed=False))

        # Look up the value schema; any failure downgrades to a warning.
        schema_found = True
        try:
            latest = self.schema_registry_client.get_latest_version(
                topic + "-value")
            schema = latest.schema
        except Exception as e:
            self.report.report_warning(topic, f"failed to get schema: {e}")
            schema_found = False

        if schema_found:
            parsed_fields: List[SchemaField] = []
            if schema.schema_type == "AVRO":
                parsed_fields = schema_util.avro_schema_to_mce_fields(
                    schema.schema_str)
            else:
                self.report.report_warning(
                    topic,
                    f"unable to parse kafka schema type {schema.schema_type}")

            snapshot.aspects.append(
                SchemaMetadata(
                    schemaName=topic,
                    version=0,
                    hash=str(schema._hash),
                    platform=f"urn:li:dataPlatform:{platform}",
                    platformSchema=KafkaSchema(
                        documentSchema=schema.schema_str),
                    fields=parsed_fields,
                    created=AuditStamp(time=sys_time, actor=actor),
                    lastModified=AuditStamp(time=sys_time, actor=actor),
                ))

        return MetadataChangeEvent(proposedSnapshot=snapshot)
Пример #22
0
 def _get_schema_fields(self, pulsar_topic: PulsarTopic,
                        schema: PulsarSchema,
                        is_key_schema: bool) -> List[SchemaField]:
     """Convert a Pulsar schema into SchemaFields.

     AVRO and JSON schema types are both handed to
     ``schema_util.avro_schema_to_mce_fields``. Any other type is reported as
     a warning against the topic's full name and gives an empty list.
     """
     if schema.schema_type in ("AVRO", "JSON"):
         # Extract fields from schema and get the FQN for the schema
         return schema_util.avro_schema_to_mce_fields(
             schema.schema_str, is_key_schema=is_key_schema)

     self.report.report_warning(
         pulsar_topic.fullname,
         f"Parsing Pulsar schema type {schema.schema_type} is currently not implemented",
     )
     return []
Пример #23
0
def test_key_schema_handling():
    """Tests key schema handling.

    Parses a record whose single field is a union of two records and a nested
    array with ``is_key_schema=True``, then checks that every expected path
    carries the ``[key=True]`` marker and that every returned field is
    flagged as part of the key.
    """
    schema = """
    {
        "type": "record",
        "name": "ABFooUnion",
        "namespace": "com.linkedin",
        "fields": [{
            "name": "a",
            "type": [ {
                "type": "record",
                "name": "A",
                "fields": [{ "name": "f", "type": "string" } ]
                }, {
                "type": "record",
                "name": "B",
                "fields": [{ "name": "f", "type": "string" } ]
                }, {
                "type": "array",
                "items": {
                    "type": "array",
                    "items": [
                        "null",
                        {
                            "type": "record",
                            "name": "Foo",
                            "fields": [{ "name": "f", "type": "long" }]
                        }
                    ]
                }
        }]
        }]
    }
"""
    fields: List[SchemaField] = avro_schema_to_mce_fields(schema,
                                                          is_key_schema=True)
    expected_field_paths: List[str] = [
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
        "[version=2.0].[key=True].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f",
    ]
    # Fixed: the helper name was misspelled ("assret_..."), which raised
    # NameError before any path comparison could run.
    assert_field_paths_match(fields, expected_field_paths)
    # Every field produced from a key schema must be marked as a key part.
    for f in fields:
        assert f.isPartOfKey
 def _get_schema_fields(self, topic: str, schema: Schema,
                        is_key_schema: bool) -> List[SchemaField]:
     """Convert a registry schema into SchemaFields.

     Only AVRO schemas are converted. For any other schema type a warning is
     reported for the topic and an empty list is returned.
     """
     if schema.schema_type != "AVRO":
         self.report.report_warning(
             topic,
             f"Parsing kafka schema type {schema.schema_type} is currently not implemented",
         )
         return []

     avro_text: str = self.get_schema_str_replace_confluent_ref_avro(
         schema)
     # "value.id" or "value.[type=string]id"
     return schema_util.avro_schema_to_mce_fields(
         avro_text, is_key_schema=is_key_schema)
Пример #25
0
def test_mce_avro_parses_okay():
    """Parse the packaged ``schema.avsc`` as a regression check on a large schema."""
    # Resolved relative to this file: ../../src/datahub/metadata/schema.avsc
    avsc_path = os.path.join(
        os.path.dirname(__file__),
        "..",
        "..",
        "src",
        "datahub",
        "metadata",
        "schema.avsc",
    )
    avsc_text = Path(avsc_path).read_text()
    parsed_fields = avro_schema_to_mce_fields(avsc_text)
    assert len(parsed_fields)
    # Ensure that all the paths corresponding to the AVRO fields are unique.
    assert_field_paths_are_unique(parsed_fields)
    log_field_paths(parsed_fields)
Пример #26
0
def test_needs_disambiguation_nested_union_of_records_with_same_field_name():
    """Union members that share the field name ``f`` must get distinct paths."""
    union_schema = """
    {
        "type": "record",
        "name": "ABFooUnion",
        "namespace": "com.linkedin",
        "fields": [{
            "name": "a",
            "type": [ {
                "type": "record",
                "name": "A",
                "fields": [{ "name": "f", "type": "string" } ]
                }, {
                "type": "record",
                "name": "B",
                "fields": [{ "name": "f", "type": "string" } ]
                }, {
                "type": "array",
                "items": {
                    "type": "array",
                    "items": [
                        "null",
                        {
                            "type": "record",
                            "name": "Foo",
                            "fields": [{ "name": "f", "type": "long" }]
                        }
                    ]
                }
        }]
        }]
    }
"""
    parsed_fields = avro_schema_to_mce_fields(union_schema)
    # The type tokens (A, B, array/array/Foo) tell the three "f" fields apart.
    expected: List[str] = [
        "[version=2.0].[type=ABFooUnion].[type=union].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=A].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=B].a.[type=string].f",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a",
        "[version=2.0].[type=ABFooUnion].[type=union].[type=array].[type=array].[type=Foo].a.[type=long].f",
    ]
    assert_field_paths_match(parsed_fields, expected)
Пример #27
0
def test_logical_types():
    """Logical-type fields keep their underlying type in the path and map to
    the expected metadata type classes."""
    logical_schema: str = """
{
    "type": "record",
    "name": "test_logical_types",
    "fields":  [
        {"name": "decimal_logical", "type": "bytes", "logicalType": "decimal", "precision": 4, "scale": 2},
        {"name": "uuid_logical", "type": "string", "logicalType": "uuid"},
        {"name": "date_logical", "type": "int", "logicalType": "date"},
        {"name": "time_millis_logical", "type": "int", "logicalType": "time-millis"},
        {"name": "time_micros_logical", "type": "long", "logicalType": "time-micros"},
        {"name": "timestamp_millis_logical", "type": "long", "logicalType": "timestamp-millis"},
        {"name": "timestamp_micros_logical", "type": "long", "logicalType": "timestamp-micros"}
    ]
}
    """
    parsed_fields: List[SchemaField] = avro_schema_to_mce_fields(
        logical_schema, is_key_schema=False
    )

    # Part 1: field paths use the underlying primitive type names.
    expected_paths: List[str] = [
        "[version=2.0].[type=test_logical_types].[type=bytes].decimal_logical",
        "[version=2.0].[type=test_logical_types].[type=string].uuid_logical",
        "[version=2.0].[type=test_logical_types].[type=int].date_logical",
        "[version=2.0].[type=test_logical_types].[type=int].time_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].time_micros_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_millis_logical",
        "[version=2.0].[type=test_logical_types].[type=long].timestamp_micros_logical",
    ]
    assert_field_paths_match(parsed_fields, expected_paths)

    # Part 2: resolved type classes, in the same order as the schema fields.
    expected_types: List[Type] = [
        NumberTypeClass,
        StringTypeClass,
        DateTypeClass,
        TimeTypeClass,
        TimeTypeClass,
        TimeTypeClass,
        TimeTypeClass,
    ]
    actual_types = [type(parsed.type.type) for parsed in parsed_fields]
    assert expected_types == actual_types
Пример #28
0
def get_schema_fields_for_hive_column(
    hive_column_name: str,
    hive_column_type: str,
    description: Optional[str] = None,
    default_nullable: bool = False,
    is_part_of_key: bool = False,
) -> List[SchemaField]:
    """Build SchemaFields for a single hive column.

    The column is first converted to an avro schema, which is then turned
    into schema fields. For primitive hive types the column name is appended
    to the first field's path. A non-empty ``description`` and the
    ``is_part_of_key`` flag are applied to the first field only.
    """
    column_avro_schema = get_avro_schema_for_hive_column(
        hive_column_name=hive_column_name, hive_column_type=hive_column_type)
    result = avro_schema_to_mce_fields(
        avro_schema_string=json.dumps(column_avro_schema),
        default_nullable=default_nullable,
    )
    assert result

    head = result[0]
    if HiveColumnToAvroConverter.is_primitive_hive_type(hive_column_type):
        # Primitive avro schema does not have any field names. Append it to fieldPath.
        head.fieldPath += f".{hive_column_name}"
    if description:
        head.description = description
    head.isPartOfKey = is_part_of_key
    return result
Пример #29
0
def test_avro_schema_namespacing():
    """The record's namespace must not appear in the generated field path."""
    namespaced_schema = """
{
  "type": "record",
  "name": "name",
  "namespace": "should.not.show.up.namespace",
  "fields": [
    {
      "name": "aStringField",
      "type": "string",
      "doc": "some docs",
      "default": "this is custom, default value"
    }
  ]
}
"""
    parsed_fields = avro_schema_to_mce_fields(namespaced_schema)
    # Only the short record name is expected in the path.
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=name].[type=string].aStringField",
        ],
    )
Пример #30
0
def test_union_with_nested_record_of_union():
    """A union of boolean and a record holding a nullable string field.

    Checks the generated paths, that the innermost field resolves to
    ``StringTypeClass``, and the native data type reported for each field.
    """
    union_schema = """
    {
        "type": "record",
        "name": "UnionSample",
        "namespace": "com.linkedin",
        "fields": [
            {
                "name": "aUnion",
                "type": [
                    "boolean",
                    {
                        "type": "record",
                        "name": "Rcd",
                        "fields": [
                            {
                                "name": "aNullableStringField",
                                "type": ["null", "string"]
                            }
                        ]
                    }
                ]
            }
        ]
    }
"""
    parsed_fields = avro_schema_to_mce_fields(union_schema)
    assert_field_paths_match(
        parsed_fields,
        [
            "[version=2.0].[type=UnionSample].[type=union].aUnion",
            "[version=2.0].[type=UnionSample].[type=union].[type=boolean].aUnion",
            "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion",
            "[version=2.0].[type=UnionSample].[type=union].[type=Rcd].aUnion.[type=string].aNullableStringField",
        ],
    )

    # The nullable string is expected to resolve to its non-null member type.
    nullable_string_field = parsed_fields[3]
    assert isinstance(nullable_string_field.type.type, StringTypeClass)

    # Native data types, position by position.
    assert parsed_fields[0].nativeDataType == "union"
    assert parsed_fields[1].nativeDataType == "boolean"
    assert parsed_fields[2].nativeDataType == "Rcd"
    assert parsed_fields[3].nativeDataType == "string"