Exemplo n.º 1
0
    def test_transform_array_struct_nested_type(self) -> None:
        """Parse ``array<struct<nest1:int,nest2:int>>`` and compare with a hand-built tree.

        The expected tree is an array named ``col1`` whose inner type is a
        struct named ``_inner_``; the struct holds two scalar ``int`` fields,
        ``nest1`` and ``nest2``, with sort orders 0 and 1.
        """
        full_type = 'array<struct<nest1:int,nest2:int>>'
        column = ColumnMetadata('col1', None, full_type, 0)
        column.set_column_key(self.column_key)

        expected = ArrayTypeMetadata(name='col1',
                                     parent=column,
                                     type_str=full_type)
        expected_struct = StructTypeMetadata(
            name='_inner_',
            parent=expected,
            type_str='struct<nest1:int,nest2:int>')

        # Both struct fields are plain ints; only name and position differ.
        fields = {}
        for position, field_name in enumerate(('nest1', 'nest2')):
            field = ScalarTypeMetadata(name=field_name,
                                       parent=expected_struct,
                                       type_str='int')
            field.sort_order = position
            fields[field_name] = field

        expected_struct.struct_items = fields
        expected.array_inner_type = expected_struct

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Exemplo n.º 2
0
    def test_hive_parser_usage(self) -> None:
        """Run ``ComplexTypeTransformer`` with the Hive parser on a one-column table.

        The transformer is configured through ``PARSING_FUNCTION`` to use
        ``parse_hive_type``. After transforming a table whose only column has
        type ``array<array<int>>``, every returned column must carry type
        metadata equal to the hand-built expectation, and the transformer must
        report one success and no failures.
        """
        transformer = ComplexTypeTransformer()
        config = ConfigFactory.from_dict({
            PARSING_FUNCTION:
            'databuilder.utils.hive_complex_type_parser.parse_hive_type',
        })
        transformer.init(conf=config)

        column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
        table_metadata = TableMetadata('hive', 'gold', 'test_schema',
                                       'test_table', 'test_table', [column])
        array_type = ArrayTypeMetadata(name='col1',
                                       parent=column,
                                       type_str='array<array<int>>')
        inner_array = ArrayTypeMetadata(name='_inner_',
                                        parent=array_type,
                                        type_str='array<int>')

        array_type.array_inner_type = inner_array

        result = transformer.transform(table_metadata)

        for actual in result.columns:
            type_metadata = actual.get_type_metadata()
            # assertIsInstance reports the offending object on failure,
            # unlike assertTrue(isinstance(...)).
            self.assertIsInstance(type_metadata, TypeMetadata)
            self.assertEqual(type_metadata, array_type)

        # The counters do not depend on the loop variable; checking them here
        # means they are still verified if no columns are returned.
        self.assertEqual(transformer.success_count, 1)
        self.assertEqual(transformer.failure_count, 0)
Exemplo n.º 3
0
    def test_transform_struct_map_array_nested_type(self) -> None:
        """Parse a struct holding a map-of-arrays field and an array field.

        The expected tree is assembled by hand: ``nest1`` is a map with a
        ``string`` key and an ``array<int>`` value, ``nest2`` is an
        ``array<string>``; the two fields carry sort orders 0 and 1.
        """
        full_type = 'struct<nest1:map<string,array<int>>,nest2:array<string>>'
        column = ColumnMetadata('col1', None, full_type, 0)
        column.set_column_key(self.column_key)

        expected = StructTypeMetadata(name='col1',
                                      parent=column,
                                      type_str=full_type)

        # First struct field: map<string,array<int>>.
        nest1_map = MapTypeMetadata(name='nest1',
                                    parent=expected,
                                    type_str='map<string,array<int>>')
        nest1_map.map_key_type = ScalarTypeMetadata(name='_map_key',
                                                    parent=nest1_map,
                                                    type_str='string')
        nest1_map.map_value_type = ArrayTypeMetadata(name='_map_value',
                                                     parent=nest1_map,
                                                     type_str='array<int>')
        nest1_map.sort_order = 0

        # Second struct field: array<string>.
        nest2_array = ArrayTypeMetadata(name='nest2',
                                        parent=expected,
                                        type_str='array<string>')
        nest2_array.sort_order = 1

        expected.struct_items = {'nest1': nest1_map, 'nest2': nest2_array}

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Exemplo n.º 4
0
    def test_transform_array_type(self) -> None:
        """Parse ``array<array<int>>`` and compare with a hand-built two-level array.

        The expected outer array (``col1``) has an inner array named
        ``_inner_`` of type ``array<int>``; no inner type is attached to that
        inner array.
        """
        full_type = 'array<array<int>>'
        column = ColumnMetadata('col1', None, full_type, 0)
        column.set_column_key(self.column_key)

        expected = ArrayTypeMetadata(name='col1',
                                     parent=column,
                                     type_str=full_type)
        expected.array_inner_type = ArrayTypeMetadata(name='_inner_',
                                                      parent=expected,
                                                      type_str='array<int>')

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Exemplo n.º 5
0
def parse_hive_type(type_str: str, name: str, parent: Union[ColumnMetadata, TypeMetadata]) -> TypeMetadata:
    """Build a ``TypeMetadata`` tree from a Hive type string.

    The string is lower-cased and parsed with
    ``complex_type.parseString(..., parseAll=True)``. Nested types are handled
    by recursion; each child receives the metadata object being built as its
    parent.

    :param type_str: Hive type definition, e.g. ``array<map<string,int>>``.
    :param name: name given to the metadata node created for this type.
    :param parent: column or type metadata that owns the new node.
    :return: scalar, array, map or struct metadata describing ``type_str``.
    :raises Exception: if the parse result is none of scalar, array, map or
        struct.
    """
    type_str = type_str.lower()
    parsed = complex_type.parseString(type_str, parseAll=True)

    if parsed.scalar_type:
        return ScalarTypeMetadata(name=name, parent=parent, type_str=type_str)

    body = parsed[0]

    if parsed.array_type:
        array_metadata = ArrayTypeMetadata(name=name, parent=parent, type_str=type_str)
        element = parse_hive_type(body.type, '_inner_', array_metadata)
        # Only non-scalar element types are attached to the array node.
        if not isinstance(element, ScalarTypeMetadata):
            array_metadata.array_inner_type = element
        return array_metadata

    if parsed.map_type:
        map_metadata = MapTypeMetadata(name=name, parent=parent, type_str=type_str)
        map_metadata.map_key_type = parse_hive_type(body.key, '_map_key', map_metadata)
        map_metadata.map_value_type = parse_hive_type(body.type, '_map_value', map_metadata)
        return map_metadata

    if parsed.struct_type:
        struct_metadata = StructTypeMetadata(name=name, parent=parent, type_str=type_str)
        members = {}
        for position, field in enumerate(body):
            member = parse_hive_type(field.type, field.name, struct_metadata)
            # Record the field's position within the struct definition.
            member.sort_order = position
            members[field.name] = member
        struct_metadata.struct_items = members
        return struct_metadata

    raise Exception(f"Unrecognized type: {type_str}")
Exemplo n.º 6
0
    def test_transform_array_map_nested_type(self) -> None:
        """Parse ``array<map<string,int>>`` and compare with a hand-built tree.

        The expected array (``col1``) wraps a map named ``_inner_`` whose key
        is a scalar ``string`` (``_map_key``) and whose value is a scalar
        ``int`` (``_map_value``).
        """
        full_type = 'array<map<string,int>>'
        column = ColumnMetadata('col1', None, full_type, 0)
        column.set_column_key(self.column_key)

        expected = ArrayTypeMetadata(name='col1',
                                     parent=column,
                                     type_str=full_type)
        element_map = MapTypeMetadata(name='_inner_',
                                      parent=expected,
                                      type_str='map<string,int>')
        element_map.map_key_type = ScalarTypeMetadata(name='_map_key',
                                                      parent=element_map,
                                                      type_str='string')
        element_map.map_value_type = ScalarTypeMetadata(name='_map_value',
                                                        parent=element_map,
                                                        type_str='int')
        expected.array_inner_type = element_map

        parsed = parse_hive_type(column.type, column.name, column)
        self.assertEqual(parsed, expected)
Exemplo n.º 7
0
    def _set_up_type_metadata(self,
                              parent_column: ColumnMetadata) -> TypeMetadata:
        """Build a nested array type tree rooted at ``parent_column``.

        The root is named ``has_nested_type`` with type
        ``array<array<array<string>>>``; it wraps an ``_inner_`` array of type
        ``array<array<string>>``, which in turn wraps an ``_inner_`` array of
        type ``array<string>``.

        :param parent_column: column used as the parent of the root node.
        :return: the root array type metadata.
        """
        outer = ArrayTypeMetadata(name='has_nested_type',
                                  parent=parent_column,
                                  type_str='array<array<array<string>>>')
        middle = ArrayTypeMetadata(name='_inner_',
                                   parent=outer,
                                   type_str='array<array<string>>')
        innermost = ArrayTypeMetadata(name='_inner_',
                                      parent=middle,
                                      type_str='array<string>')

        # Link the levels together from the inside out.
        middle.array_inner_type = innermost
        outer.array_inner_type = middle

        return outer
Exemplo n.º 8
0
    def test_serialize_array_struct_type_metadata(self) -> None:
        """Serialize ``array<struct<c1:array<string>,c2:string>>`` into Neo4j rows.

        The type tree is built by hand, then ``next_node()`` and
        ``next_relation()`` are drained through ``neo4_serializer``. The
        leading serialized rows must match the expected nodes and
        relationships, in order.
        """
        column = ColumnMetadata('col1', None,
                                'array<struct<c1:array<string>,c2:string>>', 0)
        column.set_column_key(self.column_key)

        array_type_metadata = ArrayTypeMetadata(
            name='col1',
            parent=column,
            type_str='array<struct<c1:array<string>,c2:string>>')
        nested_struct_type_metadata_level1 = StructTypeMetadata(
            name='_inner_',
            parent=array_type_metadata,
            type_str='struct<c1:array<string>,c2:string>')
        nested_array_type_metadata_level2 = ArrayTypeMetadata(
            name='c1',
            parent=nested_struct_type_metadata_level1,
            type_str='array<string>')
        nested_scalar_type_metadata_level2 = ScalarTypeMetadata(
            name='c2',
            parent=nested_struct_type_metadata_level1,
            type_str='string')

        array_type_metadata.array_inner_type = nested_struct_type_metadata_level1
        nested_struct_type_metadata_level1.struct_items = {
            'c1': nested_array_type_metadata_level2,
            'c2': nested_scalar_type_metadata_level2
        }
        nested_array_type_metadata_level2.sort_order = 0
        nested_scalar_type_metadata_level2.sort_order = 1

        expected_nodes = [
            {
                'kind': 'array',
                'name': 'col1',
                'data_type': 'array<struct<c1:array<string>,c2:string>>',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1'
            },
            {
                'kind': 'struct',
                'name': '_inner_',
                'data_type': 'struct<c1:array<string>,c2:string>',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_'
            },
            {
                'kind': 'array',
                'name': 'c1',
                'data_type': 'array<string>',
                'LABEL': 'Type_Metadata',
                'sort_order:UNQUOTED': 0,
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/c1'
            },
            {
                'kind': 'scalar',
                'name': 'c2',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'sort_order:UNQUOTED': 1,
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/c2'
            },
        ]
        expected_rels = [
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Column',
                'TYPE': 'TYPE_METADATA',
                'REVERSE_TYPE': 'TYPE_METADATA_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/c1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/c2',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
        ]

        # Drain and check the nodes before requesting any relationship.
        actual_nodes = []
        node_row = array_type_metadata.next_node()
        while node_row:
            actual_nodes.append(neo4_serializer.serialize_node(node_row))
            node_row = array_type_metadata.next_node()
        # Only the leading len(expected_nodes) rows are compared; fail with a
        # readable assertion rather than an IndexError if too few were produced.
        self.assertGreaterEqual(len(actual_nodes), len(expected_nodes))
        for index, expected_node in enumerate(expected_nodes):
            self.assertEqual(actual_nodes[index], expected_node)

        actual_rels = []
        relation_row = array_type_metadata.next_relation()
        while relation_row:
            actual_rels.append(
                neo4_serializer.serialize_relationship(relation_row))
            relation_row = array_type_metadata.next_relation()
        self.assertGreaterEqual(len(actual_rels), len(expected_rels))
        for index, expected_rel in enumerate(expected_rels):
            self.assertEqual(actual_rels[index], expected_rel)
Exemplo n.º 9
0
    def test_serialize_array_map_type_metadata(self) -> None:
        """Serialize ``array<map<string,array<string>>>`` into Neo4j rows.

        The type tree is built by hand, then ``next_node()`` and
        ``next_relation()`` are drained through ``neo4_serializer``. The
        leading serialized rows must match the expected nodes and
        relationships, in order.
        """
        column = ColumnMetadata('col1', None,
                                'array<map<string,array<string>>>', 0)
        column.set_column_key(self.column_key)

        array_type_metadata = ArrayTypeMetadata(
            name='col1',
            parent=column,
            type_str='array<map<string,array<string>>>')
        nested_map_type_metadata_level1 = MapTypeMetadata(
            name='_inner_',
            parent=array_type_metadata,
            type_str='map<string,array<string>>')
        nested_map_key = ScalarTypeMetadata(
            name='_map_key',
            parent=nested_map_type_metadata_level1,
            type_str='string')
        nested_array_type_metadata_level2 = ArrayTypeMetadata(
            name='_map_value',
            parent=nested_map_type_metadata_level1,
            type_str='array<string>')
        nested_scalar_type_metadata_level3 = ScalarTypeMetadata(
            name='_inner_',
            parent=nested_array_type_metadata_level2,
            type_str='string')

        array_type_metadata.array_inner_type = nested_map_type_metadata_level1
        nested_map_type_metadata_level1.map_key_type = nested_map_key
        nested_map_type_metadata_level1.map_value_type = nested_array_type_metadata_level2
        # NOTE: a scalar is attached as the innermost array's element type, but
        # the expectations below contain no row for it.
        nested_array_type_metadata_level2.array_inner_type = nested_scalar_type_metadata_level3

        expected_nodes = [
            {
                'kind': 'array',
                'data_type': 'array<map<string,array<string>>>',
                'LABEL': 'Type_Metadata',
                'name': 'col1',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1'
            },
            {
                'kind': 'map',
                'data_type': 'map<string,array<string>>',
                'LABEL': 'Type_Metadata',
                'name': '_inner_',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_'
            },
            {
                'kind': 'scalar',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'name': '_map_key',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_key'
            },
            {
                'kind': 'array',
                'data_type': 'array<string>',
                'LABEL': 'Type_Metadata',
                'name': '_map_value',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_value'
            },
        ]
        expected_rels = [
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Column',
                'TYPE': 'TYPE_METADATA',
                'REVERSE_TYPE': 'TYPE_METADATA_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_key',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_/_map_value',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/_inner_',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
        ]

        # Drain and check the nodes before requesting any relationship.
        actual_nodes = []
        node_row = array_type_metadata.next_node()
        while node_row:
            actual_nodes.append(neo4_serializer.serialize_node(node_row))
            node_row = array_type_metadata.next_node()
        # Only the leading len(expected_nodes) rows are compared (any extra
        # trailing rows are not checked); fail with a readable assertion rather
        # than an IndexError if too few were produced.
        self.assertGreaterEqual(len(actual_nodes), len(expected_nodes))
        for index, expected_node in enumerate(expected_nodes):
            self.assertEqual(actual_nodes[index], expected_node)

        actual_rels = []
        relation_row = array_type_metadata.next_relation()
        while relation_row:
            actual_rels.append(
                neo4_serializer.serialize_relationship(relation_row))
            relation_row = array_type_metadata.next_relation()
        self.assertGreaterEqual(len(actual_rels), len(expected_rels))
        for index, expected_rel in enumerate(expected_rels):
            self.assertEqual(actual_rels[index], expected_rel)
Exemplo n.º 10
0
    def test_serialize_struct_map_array_type_metadata(self) -> None:
        """Serialize a struct with described map and array fields into Neo4j rows.

        The column type is
        ``struct<c1:map<string,array<string>>,c2:array<string>>``; ``c1`` and
        ``c2`` each carry a description. The type tree is built by hand, then
        ``next_node()`` and ``next_relation()`` are drained through
        ``neo4_serializer``. The leading serialized rows must match the
        expected nodes and relationships (including the description rows), in
        order.
        """
        column = ColumnMetadata(
            'col1', None,
            'struct<c1:map<string,array<string>>,c2:array<string>>', 0)
        column.set_column_key(self.column_key)

        struct_type_metadata = StructTypeMetadata(
            name='col1',
            parent=column,
            type_str='struct<c1:map<string,array<string>>,c2:array<string>>')
        nested_map_type_metadata_level1 = MapTypeMetadata(
            name='c1',
            parent=struct_type_metadata,
            type_str='map<string,array<string>>',
            description='description of map')
        nested_map_key = ScalarTypeMetadata(
            name='_map_key',
            parent=nested_map_type_metadata_level1,
            type_str='string')
        nested_array_type_metadata_level2 = ArrayTypeMetadata(
            name='_map_value',
            parent=nested_map_type_metadata_level1,
            type_str='array<string>')
        nested_array_type_metadata_level1 = ArrayTypeMetadata(
            name='c2',
            parent=struct_type_metadata,
            type_str='array<string>',
            description='description of array')

        struct_type_metadata.struct_items = {
            'c1': nested_map_type_metadata_level1,
            'c2': nested_array_type_metadata_level1
        }
        nested_map_type_metadata_level1.map_key_type = nested_map_key
        nested_map_type_metadata_level1.map_value_type = nested_array_type_metadata_level2
        nested_map_type_metadata_level1.sort_order = 0
        nested_array_type_metadata_level1.sort_order = 1

        expected_nodes = [
            {
                'kind': 'struct',
                'name': 'col1',
                'LABEL': 'Type_Metadata',
                'data_type': 'struct<c1:map<string,array<string>>,c2:array<string>>',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1'
            },
            {
                'kind': 'map',
                'name': 'c1',
                'data_type': 'map<string,array<string>>',
                'LABEL': 'Type_Metadata',
                'sort_order:UNQUOTED': 0,
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1'
            },
            {
                'description': 'description of map',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_description',
                'LABEL': 'Description',
                'description_source': 'description'
            },
            {
                'kind': 'scalar',
                'name': '_map_key',
                'data_type': 'string',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_key'
            },
            {
                'kind': 'array',
                'name': '_map_value',
                'data_type': 'array<string>',
                'LABEL': 'Type_Metadata',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_value'
            },
            {
                'kind': 'array',
                'name': 'c2',
                'data_type': 'array<string>',
                'LABEL': 'Type_Metadata',
                'sort_order:UNQUOTED': 1,
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2'
            },
            {
                'description': 'description of array',
                'KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2/_description',
                'LABEL': 'Description',
                'description_source': 'description'
            },
        ]
        expected_rels = [
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Column',
                'TYPE': 'TYPE_METADATA',
                'REVERSE_TYPE': 'TYPE_METADATA_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_description',
                'START_LABEL': 'Type_Metadata',
                'END_LABEL': 'Description',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1',
                'TYPE': 'DESCRIPTION',
                'REVERSE_TYPE': 'DESCRIPTION_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_key',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1/_map_value',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1',
                'END_LABEL': 'Type_Metadata',
                'START_LABEL': 'Type_Metadata',
                'TYPE': 'SUBTYPE',
                'REVERSE_TYPE': 'SUBTYPE_OF'
            },
            {
                'END_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2/_description',
                'START_LABEL': 'Type_Metadata',
                'END_LABEL': 'Description',
                'START_KEY': 'hive://gold.test_schema1/test_table1/col1/type/col1/c2',
                'TYPE': 'DESCRIPTION',
                'REVERSE_TYPE': 'DESCRIPTION_OF'
            },
        ]

        # Drain and check the nodes before requesting any relationship.
        actual_nodes = []
        node_row = struct_type_metadata.next_node()
        while node_row:
            actual_nodes.append(neo4_serializer.serialize_node(node_row))
            node_row = struct_type_metadata.next_node()
        # Only the leading len(expected_nodes) rows are compared; fail with a
        # readable assertion rather than an IndexError if too few were produced.
        self.assertGreaterEqual(len(actual_nodes), len(expected_nodes))
        for index, expected_node in enumerate(expected_nodes):
            self.assertEqual(actual_nodes[index], expected_node)

        actual_rels = []
        relation_row = struct_type_metadata.next_relation()
        while relation_row:
            actual_rels.append(
                neo4_serializer.serialize_relationship(relation_row))
            relation_row = struct_type_metadata.next_relation()
        self.assertGreaterEqual(len(actual_rels), len(expected_rels))
        for index, expected_rel in enumerate(expected_rels):
            self.assertEqual(actual_rels[index], expected_rel)