Example #1
    def setUp(self) -> None:
        super(TestQueryJoin, self).setUp()
        # Display full diffs
        self.maxDiff = None
        self.tbl1_col = ColumnMetadata('field', '', '', 0)
        self.left_table_metadata = TableMetadata('hive', 'gold',
                                                 'test_schema1', 'test_table1',
                                                 'test_table1 desc',
                                                 [self.tbl1_col])
        self.tbl2_col = ColumnMetadata('field', '', '', 0)
        self.right_table_metadata = TableMetadata('hive', 'gold',
                                                  'test_schema1',
                                                  'test_table2',
                                                  'test_table2 desc',
                                                  [self.tbl2_col])
        self.query_metadata = QueryMetadata(
            sql="select * from table a where a.field > 3",
            tables=[self.left_table_metadata, self.right_table_metadata])

        self.query_join_metadata = QueryJoinMetadata(
            left_table=self.left_table_metadata,
            right_table=self.right_table_metadata,
            left_column=self.tbl1_col,
            right_column=self.tbl2_col,
            join_type='inner join',
            join_operator='=',
            join_sql=
            'test_table1 = join test_table2 on test_tabl1.field = test_table2.field',
            query_metadata=self.query_metadata)
        self._expected_key = ('inner-join-'
                              'hive://gold.test_schema1/test_table1/field-'
                              '=-'
                              'hive://gold.test_schema1/test_table2/field')
Example #2
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Uses itertools.groupby over the raw-level iterator to group rows by
        table and yield one TableMetadata per table.
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                column = None
                if row['is_partition_col'] == 1:
                    # add a badge to indicate a partition column
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'], [PARTITION_BADGE])
                else:
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'])
                columns.append(column)
            is_view = last_row['is_view'] == 1
            yield TableMetadata('hive', self._cluster,
                                last_row['schema'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                is_view=is_view)
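Note: a minimal, self-contained sketch of the itertools.groupby pattern the example relies on. groupby only merges adjacent rows, so the raw iterator must already be sorted by the grouping key or a table will be emitted in fragments:

from itertools import groupby

rows = [
    {'table': 't1', 'col': 'a'},
    {'table': 't1', 'col': 'b'},
    {'table': 't2', 'col': 'c'},
]

# Rows are pre-sorted by 'table'; each group yields that table's columns.
for table, group in groupby(rows, key=lambda r: r['table']):
    print(table, [r['col'] for r in group])
# t1 ['a', 'b']
# t2 ['c']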
Example #3
    def _iterate_over_cols(self, parent, column, cols, total_cols):
        # type: (str, Dict[str, str], List[ColumnMetadata], int) -> int
        if len(parent) > 0:
            col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
        else:
            col_name = column['name']

        if column['type'] == 'RECORD':
            col = ColumnMetadata(
                name=col_name,
                description=column.get('description', ''),
                col_type=column['type'],
                sort_order=total_cols)
            cols.append(col)
            total_cols += 1
            for field in column['fields']:
                total_cols = self._iterate_over_cols(col_name, field, cols, total_cols)
            return total_cols
        else:
            col = ColumnMetadata(
                name=col_name,
                description=column.get('description', ''),
                col_type=column['type'],
                sort_order=total_cols)
            cols.append(col)
            return total_cols + 1
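Note: the recursion above flattens nested RECORD fields into dotted column names. A standalone sketch of the same idea, using a hypothetical BigQuery-style schema dict (the 'fields' key only appears on RECORD columns):

# Hypothetical nested schema; names mirror the shape _iterate_over_cols expects.
schema = {'name': 'address', 'type': 'RECORD', 'fields': [
    {'name': 'city', 'type': 'STRING'},
    {'name': 'zip', 'type': 'STRING'},
]}

def flatten(col, parent=''):
    # Parent names are joined with dots, exactly like '{parent}.{field}' above.
    name = f'{parent}.{col["name"]}' if parent else col['name']
    yield name, col['type']
    for field in col.get('fields', []):
        yield from flatten(field, name)

print(list(flatten(schema)))
# [('address', 'RECORD'), ('address.city', 'STRING'), ('address.zip', 'STRING')]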
Example #4
    def test_tags_arent_populated_from_empty_list_and_str(self):
        # type: () -> None
        self.table_metadata6 = TableMetadata(
            'hive',
            'gold',
            'test_schema6',
            'test_table6',
            'test_table6', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            tags=[])

        self.table_metadata7 = TableMetadata(
            'hive',
            'gold',
            'test_schema7',
            'test_table7',
            'test_table7', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            tags="")

        # Test table tag fields are not populated from empty List
        node_row = self.table_metadata6.next_node()
        while node_row:
            self.assertNotEqual(node_row.get('LABEL'), 'Tag')
            node_row = self.table_metadata6.next_node()

        # Test table tag fields are not populated from empty str
        node_row = self.table_metadata7.next_node()
        while node_row:
            self.assertNotEqual(node_row.get('LABEL'), 'Tag')
            node_row = self.table_metadata7.next_node()
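Note: the two tests above imply that TableMetadata accepts tags as either a list or a comma-separated string, and that empty values produce no Tag nodes. A sketch of that normalization, assuming (not quoting) the library's behavior:

def format_tags(tags):
    # Accept a comma-separated string or a list; empty input yields [].
    if isinstance(tags, str):
        tags = [t.strip() for t in tags.split(',') if t.strip()]
    return tags or []

print(format_tags(''))            # []
print(format_tags('tag3, tag4'))  # ['tag3', 'tag4']
print(format_tags([]))            # []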
Example #5
    def _iterate_over_cols(self,
                           parent: str,
                           column: Dict[str, str],
                           cols: List[ColumnMetadata],
                           total_cols: int) -> int:
        if len(parent) > 0:
            col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
        else:
            col_name = column['name']

        if column['type'] == 'RECORD':
            col = ColumnMetadata(
                name=col_name,
                description=column.get('description', ''),
                col_type=column['type'],
                sort_order=total_cols)
            cols.append(col)
            total_cols += 1
            for field in column['fields']:
                # TODO field is actually a TableFieldSchema, per
                # https://cloud.google.com/bigquery/docs/reference/rest/v2/tables#TableFieldSchema
                # however it's typed as str, which is incorrect. Work-around by casting.
                field_casted = cast(Dict[str, str], field)
                total_cols = self._iterate_over_cols(col_name, field_casted, cols, total_cols)
            return total_cols
        else:
            col = ColumnMetadata(
                name=col_name,
                description=column.get('description', ''),
                col_type=column['type'],
                sort_order=total_cols)
            cols.append(col)
            return total_cols + 1
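Note: the cast in the loop above is purely for the type checker. typing.cast performs no runtime conversion or validation, which is why it is a cheap work-around for the mistyped field:

from typing import Dict, cast

value: object = {'name': 'x'}
field = cast(Dict[str, str], value)
assert field is value  # cast returns the very same object, unchanged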
Example #6
    def test_extraction_one_object(self, mock_salesforce: Any) -> None:
        mock_salesforce.return_value = MockSalesForce()
        config_dict: Dict = {
            f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
                "Account"
            ],
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_salesforce.return_value = MockSalesForce()
        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))
        result = extractor.extract()
        self.assertIsInstance(result, TableMetadata)

        expected = TableMetadata(
            "salesforce",
            "gold",
            "default",
            "Account",
            None,
            [
                ColumnMetadata("Id", "The Account Id", "id", 0, []),
                ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
            ],
            False,
            [],
        )

        self.assertEqual(expected.__repr__(), result.__repr__())

        self.assertIsNone(extractor.extract())
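Note: this test (and several below) compares __repr__() strings rather than the objects themselves. That only works when __repr__ is deterministic and covers every field under test; a toy illustration:

class Point:
    def __init__(self, x, y):
        self.x, self.y = x, y

    def __repr__(self):
        return f'Point(x={self.x}, y={self.y})'

# Distinct instances compare equal through their repr strings.
print(repr(Point(1, 2)) == repr(Point(1, 2)))  # True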
Example #7
    def test_create_table_metadata(self) -> None:
        scraped = ScrapedTableMetadata(schema="test_schema1", table="test_table1")
        scraped.set_columns([ScrapedColumnMetadata(name="a", description=None, data_type="string", sort_order=0),
                             ScrapedColumnMetadata(name="b", description=None, data_type="int", sort_order=1)])
        created_metadata = self.dExtractor.create_table_metadata(scraped)
        expected = TableMetadata("test_database", "test_cluster", "test_schema1", "test_table1", description=None,
                                 columns=[ColumnMetadata("a", None, "string", 0),
                                          ColumnMetadata("b", None, "int", 1)])
        self.assertEqual(str(expected), str(created_metadata))
Example #8
    def test_extraction_multiple_objects(self, mock_salesforce: Any) -> None:
        mock_salesforce.return_value = MockSalesForce()
        config_dict: Dict = {
            f"extractor.salesforce_metadata.{SalesForceExtractor.OBJECT_NAMES_KEY}": [
                "Account",
                "Profile",
            ],
            **self.config,
        }
        conf = ConfigFactory.from_dict(config_dict)

        mock_salesforce.return_value = MockSalesForce()
        extractor = SalesForceExtractor()
        extractor.init(Scoped.get_scoped_conf(conf=conf, scope=extractor.get_scope()))

        results = [extractor.extract(), extractor.extract()]
        for result in results:
            self.assertIsInstance(result, TableMetadata)

        expecteds = [
            TableMetadata(
                "salesforce",
                "gold",
                "default",
                "Account",
                None,
                [
                    ColumnMetadata("Id", "The Account Id", "id", 0, []),
                    ColumnMetadata("isDeleted", "Deleted?", "bool", 1, []),
                ],
                False,
                [],
            ),
            TableMetadata(
                "salesforce",
                "gold",
                "default",
                "Profile",
                None,
                [
                    # These columns are sorted alphabetically
                    ColumnMetadata("Business", "Important Bizness", "string", 0, []),
                    ColumnMetadata("Id", "The Profile Id", "id", 1, []),
                ],
                False,
                [],
            ),
        ]

        for result, expected in zip(results, expecteds):
            self.assertEqual(expected.__repr__(), result.__repr__())

        self.assertIsNone(extractor.extract())
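Note: both SalesForce tests rely on the extractor convention that extract() returns one record per call and None once its underlying iterator is exhausted. A minimal sketch of that contract (class name is illustrative):

class IterExtractor:
    def __init__(self, items):
        self._iter = iter(items)

    def extract(self):
        # next() with a default gives the None-at-end behavior asserted above.
        return next(self._iter, None)

e = IterExtractor(['Account', 'Profile'])
print(e.extract())  # Account
print(e.extract())  # Profile
print(e.extract())  # None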
Example #9
    def test_hive_parser_with_failures(self) -> None:
        transformer = ComplexTypeTransformer()
        config = ConfigFactory.from_dict({
            PARSING_FUNCTION:
            'databuilder.utils.hive_complex_type_parser.parse_hive_type',
        })
        transformer.init(conf=config)

        column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
        table_metadata = TableMetadata('hive', 'gold', 'test_schema',
                                       'test_table', 'test_table', [column])

        default_scalar_type = ScalarTypeMetadata(name='col1',
                                                 parent=column,
                                                 type_str='array<array<int>>')

        with patch.object(transformer, '_parsing_function') as mock:
            mock.side_effect = MagicMock(
                side_effect=Exception('Could not parse'))

            result = transformer.transform(table_metadata)

            self.assertEqual(transformer.success_count, 0)
            self.assertEqual(transformer.failure_count, 1)
            for actual in result.columns:
                self.assertEqual(actual.get_type_metadata(),
                                 default_scalar_type)
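Note: setting side_effect to an exception (or to a mock that raises one) makes every call to the patched function raise, which is how the test drives the failure path. A stripped-down sketch:

from unittest.mock import MagicMock

failing = MagicMock(side_effect=Exception('Could not parse'))
try:
    failing('array<array<int>>')
except Exception as exc:
    print(exc)  # Could not parse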
Example #10
    def test_transform_array_struct_nested_type(self) -> None:
        column = ColumnMetadata('col1', None,
                                'array<struct<nest1:int,nest2:int>>', 0)
        column.set_column_key(self.column_key)

        array_type = ArrayTypeMetadata(
            name='col1',
            parent=column,
            type_str='array<struct<nest1:int,nest2:int>>')
        inner_struct = StructTypeMetadata(
            name='_inner_',
            parent=array_type,
            type_str='struct<nest1:int,nest2:int>')
        inner_scalar_nest1 = ScalarTypeMetadata(name='nest1',
                                                parent=inner_struct,
                                                type_str='int')
        inner_scalar_nest2 = ScalarTypeMetadata(name='nest2',
                                                parent=inner_struct,
                                                type_str='int')

        array_type.array_inner_type = inner_struct
        inner_struct.struct_items = {
            'nest1': inner_scalar_nest1,
            'nest2': inner_scalar_nest2
        }
        inner_scalar_nest1.sort_order = 0
        inner_scalar_nest2.sort_order = 1

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, array_type)
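Note: the expected TypeMetadata tree is assembled by hand and compared with assertEqual, so equality must be structural rather than identity-based. A toy analogue with dataclasses, which get that recursive field-wise equality for free:

from dataclasses import dataclass, field
from typing import Dict

@dataclass
class Node:
    name: str
    type_str: str
    children: Dict[str, 'Node'] = field(default_factory=dict)

a = Node('col1', 'array<int>', {'_inner_': Node('_inner_', 'int')})
b = Node('col1', 'array<int>', {'_inner_': Node('_inner_', 'int')})
print(a == b, a is b)  # True False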
Example #11
    def test_transform_union_as_nested_type(self) -> None:
        column = ColumnMetadata(
            'col1', None,
            'struct<nest1:uniontype<string,struct<c1:int,c2:string>>,'
            'nest2:uniontype<string,int>>', 0)
        column.set_column_key(self.column_key)

        struct_type = StructTypeMetadata(
            name='col1',
            parent=column,
            type_str='struct<nest1:uniontype<string,struct<c1:int,c2:string>>,'
            'nest2:uniontype<string,int>>')
        inner_scalar_nest1 = ScalarTypeMetadata(
            name='nest1',
            parent=struct_type,
            type_str='uniontype<string,struct<c1:int,c2:string>>')
        inner_scalar_nest2 = ScalarTypeMetadata(
            name='nest2', parent=struct_type, type_str='uniontype<string,int>')

        struct_type.struct_items = {
            'nest1': inner_scalar_nest1,
            'nest2': inner_scalar_nest2
        }
        inner_scalar_nest1.sort_order = 0
        inner_scalar_nest2.sort_order = 1

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, struct_type)
Example #12
    def test_transform_struct_map_array_nested_type(self) -> None:
        column = ColumnMetadata(
            'col1', None,
            'struct<nest1:map<string,array<int>>,nest2:array<string>>', 0)
        column.set_column_key(self.column_key)

        struct_type = StructTypeMetadata(
            name='col1',
            parent=column,
            type_str='struct<nest1:map<string,array<int>>,nest2:array<string>>'
        )
        inner_map = MapTypeMetadata(name='nest1',
                                    parent=struct_type,
                                    type_str='map<string,array<int>>')
        inner_map_key = ScalarTypeMetadata(name='_map_key',
                                           parent=inner_map,
                                           type_str='string')
        inner_map_array = ArrayTypeMetadata(name='_map_value',
                                            parent=inner_map,
                                            type_str='array<int>')
        inner_struct_array = ArrayTypeMetadata(name='nest2',
                                               parent=struct_type,
                                               type_str='array<string>')

        struct_type.struct_items = {
            'nest1': inner_map,
            'nest2': inner_struct_array
        }
        inner_map.map_key_type = inner_map_key
        inner_map.map_value_type = inner_map_array
        inner_map.sort_order = 0
        inner_struct_array.sort_order = 1

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, struct_type)
Example #13
    def test_transform_map_struct_nested_type(self) -> None:
        column = ColumnMetadata('col1', None,
                                'map<string,struct<nest1:int,nest2:int>>', 0)
        column.set_column_key(self.column_key)

        map_type = MapTypeMetadata(
            name='col1',
            parent=column,
            type_str='map<string,struct<nest1:int,nest2:int>>')
        map_key = ScalarTypeMetadata(name='_map_key',
                                     parent=map_type,
                                     type_str='string')
        inner_struct = StructTypeMetadata(
            name='_map_value',
            parent=map_type,
            type_str='struct<nest1:int,nest2:int>')
        inner_scalar_nest1 = ScalarTypeMetadata(name='nest1',
                                                parent=inner_struct,
                                                type_str='int')
        inner_scalar_nest2 = ScalarTypeMetadata(name='nest2',
                                                parent=inner_struct,
                                                type_str='int')

        map_type.map_key_type = map_key
        map_type.map_value_type = inner_struct
        inner_struct.struct_items = {
            'nest1': inner_scalar_nest1,
            'nest2': inner_scalar_nest2
        }
        inner_scalar_nest1.sort_order = 0
        inner_scalar_nest2.sort_order = 1

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, map_type)
Example #14
    def test_transform_map_type(self) -> None:
        column = ColumnMetadata('col1', None, 'map<string,map<string,int>>', 0)
        column.set_column_key(self.column_key)

        map_type = MapTypeMetadata(name='col1',
                                   parent=column,
                                   type_str='map<string,map<string,int>>')
        map_key = ScalarTypeMetadata(name='_map_key',
                                     parent=map_type,
                                     type_str='string')
        map_value = MapTypeMetadata(name='_map_value',
                                    parent=map_type,
                                    type_str='map<string,int>')
        inner_map_key = ScalarTypeMetadata(name='_map_key',
                                           parent=map_value,
                                           type_str='string')
        inner_scalar = ScalarTypeMetadata(name='_map_value',
                                          parent=map_value,
                                          type_str='int')

        map_type.map_key_type = map_key
        map_type.map_value_type = map_value
        map_value.map_key_type = inner_map_key
        map_value.map_value_type = inner_scalar

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, map_type)
Example #15
    def test_tags_populated_from_str(self):
        # type: () -> None
        self.table_metadata5 = TableMetadata('hive', 'gold', 'test_schema5', 'test_table5', 'test_table5', [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0)], tags="tag3, tag4")

        # Test table tag field populated from str
        node_row = self.table_metadata5.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.table_metadata5.next_node()

        self.assertEqual(actual[2].get('LABEL'), 'Tag')
        self.assertEqual(actual[2].get('KEY'), 'tag3')
        self.assertEqual(actual[3].get('KEY'), 'tag4')

        relation_row = self.table_metadata5.next_relation()
        actual = []
        while relation_row:
            actual.append(relation_row)
            relation_row = self.table_metadata5.next_relation()

        # Table tag relationship
        expected_tab_tag_rel3 = {'END_KEY': 'tag3', 'START_LABEL': 'Table', 'END_LABEL':
                                 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5',
                                 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
        expected_tab_tag_rel4 = {'END_KEY': 'tag4', 'START_LABEL': 'Table',
                                 'END_LABEL': 'Tag', 'START_KEY': 'hive://gold.test_schema5/test_table5',
                                 'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
        self.assertEqual(actual[2], expected_tab_tag_rel3)
        self.assertEqual(actual[3], expected_tab_tag_rel4)
Example #16
    def test_hive_parser_usage(self) -> None:
        transformer = ComplexTypeTransformer()
        config = ConfigFactory.from_dict({
            PARSING_FUNCTION:
            'databuilder.utils.hive_complex_type_parser.parse_hive_type',
        })
        transformer.init(conf=config)

        column = ColumnMetadata('col1', 'array type', 'array<array<int>>', 0)
        table_metadata = TableMetadata('hive', 'gold', 'test_schema',
                                       'test_table', 'test_table', [column])
        array_type = ArrayTypeMetadata(name='col1',
                                       parent=column,
                                       type_str='array<array<int>>')
        inner_array = ArrayTypeMetadata(name='_inner_',
                                        parent=array_type,
                                        type_str='array<int>')

        array_type.array_inner_type = inner_array

        result = transformer.transform(table_metadata)

        for actual in result.columns:
            self.assertTrue(
                isinstance(actual.get_type_metadata(), TypeMetadata))
            self.assertEqual(actual.get_type_metadata(), array_type)
            self.assertEqual(transformer.success_count, 1)
            self.assertEqual(transformer.failure_count, 0)
Example #17
    def test_col_badge_field(self) -> None:
        self.table_metadata4 = TableMetadata('hive', 'gold', 'test_schema4', 'test_table4', 'test_table4', [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0, ['col-badge1', 'col-badge2'])],
            is_view=False, attr1='uri', attr2='attr2')

        node_row = self.table_metadata4.next_node()
        actual = []
        while node_row:
            serialized_node_row = neo4_serializer.serialize_node(node_row)
            actual.append(serialized_node_row)
            node_row = self.table_metadata4.next_node()

        self.assertEqual(actual[4].get('KEY'), 'col-badge1')
        self.assertEqual(actual[5].get('KEY'), 'col-badge2')

        relation_row = self.table_metadata4.next_relation()
        actual = []
        while relation_row:
            serialized_relation_row = neo4_serializer.serialize_relationship(relation_row)
            actual.append(serialized_relation_row)
            relation_row = self.table_metadata4.next_relation()

        expected_col_badge_rel1 = {'END_KEY': 'col-badge1', 'START_LABEL': 'Column',
                                   'END_LABEL': 'Badge',
                                   'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                                   'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}
        expected_col_badge_rel2 = {'END_KEY': 'col-badge2', 'START_LABEL': 'Column',
                                   'END_LABEL': 'Badge',
                                   'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
                                   'TYPE': 'HAS_BADGE', 'REVERSE_TYPE': 'BADGE_FOR'}

        self.assertEqual(actual[4], expected_col_badge_rel1)
        self.assertEqual(actual[5], expected_col_badge_rel2)
Example #18
    def test_extraction_with_database_specified(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            sql_execute.return_value = [{
                'schema': 'test_schema',
                'name': 'test_table',
                'description': 'a table for testing',
                'cluster': 'MY_CLUSTER',
                'is_view': 'false',
                'col_name': 'ds',
                'col_type': 'varchar',
                'col_description': None,
                'col_sort_order': 0
            }]

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table',
                'a table for testing',
                [ColumnMetadata('ds', None, 'varchar', 0)])

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
Example #19
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Uses itertools.groupby over the raw-level iterator to group rows by
        table and yield one TableMetadata per table.
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["col_type"],
                        row["col_sort_order"],
                    )
                )

            yield TableMetadata(
                self._database,
                self._cluster,
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
                is_view=bool(last_row["is_view"]),
            )
Example #20
    def _get_column_metadata(self, view_original_text):
        # type: (str) -> List[ColumnMetadata]
        """
        Get column metadata from the VIEW_ORIGINAL_TEXT field of the TBLS table for Presto views.
        Columns are sorted the same way as they appear in Presto Create View SQL.
        :param view_original_text:
        :return:
        """
        # remove encoded Presto View data prefix and suffix
        encoded_view_info = (view_original_text.split(
            PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX,
            1)[-1].rsplit(PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX,
                          1)[0])

        # view_original_text is b64 encoded:
        # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
        decoded_view_info = base64.b64decode(encoded_view_info)
        columns = json.loads(decoded_view_info).get('columns')

        return [
            ColumnMetadata(name=column['name'],
                           description=None,
                           col_type=column['type'],
                           sort_order=i) for i, column in enumerate(columns)
        ]
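Note: a self-contained round-trip of the decode performed above. The marker strings here are illustrative stand-ins for PrestoViewMetadataExtractor's constants; Presto wraps the base64-encoded JSON view definition in a comment:

import base64
import json

PREFIX, SUFFIX = '/* Presto View: ', ' */'
payload = json.dumps({'columns': [{'name': 'xyz', 'type': 'varchar'}]})
view_original_text = PREFIX + base64.b64encode(payload.encode()).decode() + SUFFIX

# Strip the markers, then base64-decode and parse the JSON column list.
encoded = view_original_text.split(PREFIX, 1)[-1].rsplit(SUFFIX, 1)[0]
print(json.loads(base64.b64decode(encoded))['columns'])
# [{'name': 'xyz', 'type': 'varchar'}]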
Example #21
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        It gets all tables and yields TableMetadata
        :return:
        """
        keyspaces = self._get_keyspaces()
        for keyspace in keyspaces:
            # system keyspaces
            if keyspace.startswith('system'):
                continue
            for table in self._get_tables(keyspace):
                if self._filter and not self._filter(keyspace, table):
                    continue

                columns = []

                columns_dict = self._get_columns(keyspace, table)
                for idx, (column_name, column) in enumerate(columns_dict.items()):
                    columns.append(ColumnMetadata(
                        column_name,
                        None,
                        column.cql_type,
                        idx
                    ))

                yield TableMetadata(
                    'cassandra',
                    self._cluster,
                    keyspace,
                    table,
                    None,
                    columns
                )
Example #22
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        It gets all tables and yields TableMetadata
        :return:
        """
        for row in self._get_raw_extract_iter():
            columns = []

            for i, column in enumerate(row['StorageDescriptor']['Columns']):
                columns.append(ColumnMetadata(
                    column['Name'],
                    column['Comment'] if 'Comment' in column else None,
                    column['Type'],
                    i
                ))

            yield TableMetadata(
                'glue',
                self._cluster,
                row['DatabaseName'],
                row['Name'],
                row['Description'] if 'Description' in row else None,
                columns
            )
Example #23
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        It gets all tables and yields TableMetadata
        :return:
        """
        for row in self._get_raw_extract_iter():
            columns, i = [], 0

            for column in row['StorageDescriptor']['Columns'] \
                    + row.get('PartitionKeys', []):
                columns.append(ColumnMetadata(
                    column['Name'],
                    column['Comment'] if 'Comment' in column else None,
                    column['Type'],
                    i
                ))
                i += 1

            yield TableMetadata(
                'glue',
                self._cluster,
                row['DatabaseName'],
                row['Name'],
                row.get('Description') or row.get('Parameters', {}).get('comment'),
                columns,
                row.get('TableType') == 'VIRTUAL_VIEW',
            )
Example #24
    def _get_column_values(self, manifest_columns: Dict, catalog_columns: Dict) -> List[ColumnMetadata]:
        """
        Iterates over the columns in the manifest file and creates a `ColumnMetadata` object
        with the combined information from the manifest file as well as the catalog file.

        :params manifest_columns: A dictionary of values from the manifest.json, the keys
            are column names and the values are column metadata
        :params catalog_columns: A dictionary of values from the catalog.json, the keys
            are column names and the values are column metadata
        :returns: A list of `ColumnMetadata` in Amundsen.
        """
        tbl_columns = []
        for catalog_col_name, catalog_col_content in catalog_columns.items():
            manifest_col_content = manifest_columns.get(catalog_col_name, {})
            if catalog_col_content:
                col_desc = None
                if self._extract_descriptions:
                    col_desc = manifest_col_content.get('description')

                # Only extract column-level tags IF converting to badges, Amundsen does not have column-level tags
                badges = None
                if self._extract_tags and self._dbt_tag_as == DBT_TAG_AS.BADGE:
                    badges = manifest_col_content.get('tags')

                col_metadata = ColumnMetadata(
                    name=self._default_sanitize(catalog_col_content['name']),
                    description=col_desc,
                    col_type=catalog_col_content['type'],
                    sort_order=catalog_col_content['index'],
                    badges=badges
                )
                tbl_columns.append(col_metadata)
        return tbl_columns
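Note: the method iterates the catalog (which defines name, type, and index) and enriches each column from the manifest (descriptions, tags). A sketch with hypothetical fragments in the shape the method expects:

# Illustrative catalog.json / manifest.json fragments, keyed by column name.
catalog_cols = {'id': {'name': 'id', 'type': 'integer', 'index': 0}}
manifest_cols = {'id': {'description': 'primary key', 'tags': ['pk']}}

for name, cat in catalog_cols.items():
    man = manifest_cols.get(name, {})
    print(cat['name'], cat['type'], cat['index'], man.get('description'))
# id integer 0 primary key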
Example #25
    def test_transform_invalid_struct_inner_type(self) -> None:
        column = ColumnMetadata(
            'col1', None, 'struct<nest1:varchar(256)å,'
            'nest2:<derived from deserializer>>', 0)
        column.set_column_key(self.column_key)

        with self.assertRaises(ParseException):
            parse_hive_type(column.type, column.name, column)
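Note: assertRaises as a context manager passes only if the enclosed block raises the named exception, which is how the test pins down the parser failure. A minimal generic example:

import unittest

class Demo(unittest.TestCase):
    def test_raises(self):
        # Passes because int() raises ValueError on non-numeric input.
        with self.assertRaises(ValueError):
            int('not a number')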
Example #26
    def test_extraction_with_partition_badge(self) -> None:
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [test_table]

            extractor = GlueExtractor()
            extractor.init(conf=ConfigFactory.from_dict({
                GlueExtractor.PARTITION_BADGE_LABEL_KEY:
                "partition_key",
            }))
            actual = extractor.extract()
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata(
                        'partition_key1',
                        'description of partition_key1',
                        'string',
                        6,
                        ["partition_key"],
                    ),
                ], False)
            self.assertEqual(expected.__repr__(), actual.__repr__())
Example #27
    def test_tags_field(self) -> None:
        self.table_metadata4 = TableMetadata(
            'hive',
            'gold',
            'test_schema4',
            'test_table4',
            'test_table4', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0)
            ],
            is_view=False,
            tags=['tag1', 'tag2'],
            attr1='uri',
            attr2='attr2')

        node_row = self.table_metadata4.next_node()
        actual = []
        while node_row:
            node_row_serialized = neo4_serializer.serialize_node(node_row)
            actual.append(node_row_serialized)
            node_row = self.table_metadata4.next_node()

        self.assertEqual(actual[0].get('attr1'), 'uri')
        self.assertEqual(actual[0].get('attr2'), 'attr2')

        self.assertEqual(actual[2].get('LABEL'), 'Tag')
        self.assertEqual(actual[2].get('KEY'), 'tag1')
        self.assertEqual(actual[3].get('KEY'), 'tag2')

        relation_row = self.table_metadata4.next_relation()
        actual = []
        while relation_row:
            relation_row_serialized = neo4_serializer.serialize_relationship(
                relation_row)
            actual.append(relation_row_serialized)
            relation_row = self.table_metadata4.next_relation()

        # Table tag relationship
        expected_tab_tag_rel1 = {
            'END_KEY': 'tag1',
            'START_LABEL': 'Table',
            'END_LABEL': 'Tag',
            'START_KEY': 'hive://gold.test_schema4/test_table4',
            'TYPE': 'TAGGED_BY',
            'REVERSE_TYPE': 'TAG'
        }
        expected_tab_tag_rel2 = {
            'END_KEY': 'tag2',
            'START_LABEL': 'Table',
            'END_LABEL': 'Tag',
            'START_KEY': 'hive://gold.test_schema4/test_table4',
            'TYPE': 'TAGGED_BY',
            'REVERSE_TYPE': 'TAG'
        }

        self.assertEqual(actual[2], expected_tab_tag_rel1)
        self.assertEqual(actual[3], expected_tab_tag_rel2)
Example #28
    def test_transform_no_complex_type(self) -> None:
        column = ColumnMetadata('col1', None, 'int', 0)
        column.set_column_key(self.column_key)

        scalar_type = ScalarTypeMetadata(name='col1',
                                         parent=column,
                                         type_str='int')

        actual = parse_hive_type(column.type, column.name, column)
        self.assertEqual(actual, scalar_type)
Example #29
    def test_extraction_with_default_conf(self, mock_columns, mock_tables,
                                          mock_keyspaces):
        # type: () -> None
        mock_keyspaces.return_value = {'test_schema': None}
        mock_tables.return_value = {'test_table': None}
        columns_dict = OrderedDict()
        columns_dict['id'] = CassandraColumnMetadata(None, 'id', 'int')
        columns_dict['txt'] = CassandraColumnMetadata(None, 'txt', 'text')
        mock_columns.return_value = columns_dict

        extractor = CassandraExtractor()
        extractor.init(self.default_conf)
        actual = extractor.extract()
        expected = TableMetadata('cassandra', 'gold', 'test_schema',
                                 'test_table', None, [
                                     ColumnMetadata('id', None, 'int', 0),
                                     ColumnMetadata('txt', None, 'text', 1)
                                 ])
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
Example #30
    def test_extraction_with_multiple_views(self) -> None:
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            columns1 = {'columns': [{'name': 'xyz', 'type': 'varchar'},
                                    {'name': 'xyy', 'type': 'double'},
                                    {'name': 'aaa', 'type': 'int'},
                                    {'name': 'ab', 'type': 'varchar'}]}

            columns2 = {'columns': [{'name': 'xyy', 'type': 'varchar'},
                                    {'name': 'ab', 'type': 'double'},
                                    {'name': 'aaa', 'type': 'int'},
                                    {'name': 'xyz', 'type': 'varchar'}]}

            sql_execute.return_value = [
                {'tbl_id': 2,
                 'schema': 'test_schema2',
                 'name': 'test_view2',
                 'tbl_type': 'virtual_view',
                 'view_original_text': base64.b64encode(json.dumps(columns2).encode()).decode("utf-8")},
                {'tbl_id': 1,
                 'schema': 'test_schema1',
                 'name': 'test_view1',
                 'tbl_type': 'virtual_view',
                 'view_original_text': base64.b64encode(json.dumps(columns1).encode()).decode("utf-8")},
            ]

            extractor = PrestoViewMetadataExtractor()
            extractor.init(self.conf)
            actual_first_view = extractor.extract()
            expected_first_view = TableMetadata('presto', 'gold', 'test_schema2', 'test_view2', None,
                                                [ColumnMetadata(u'xyy', None, u'varchar', 0),
                                                 ColumnMetadata(u'ab', None, u'double', 1),
                                                 ColumnMetadata(u'aaa', None, u'int', 2),
                                                 ColumnMetadata(u'xyz', None, u'varchar', 3)],
                                                True)
            self.assertEqual(expected_first_view.__repr__(), actual_first_view.__repr__())

            actual_second_view = extractor.extract()
            expected_second_view = TableMetadata('presto', 'gold', 'test_schema1', 'test_view1', None,
                                                 [ColumnMetadata(u'xyz', None, u'varchar', 0),
                                                  ColumnMetadata(u'xyy', None, u'double', 1),
                                                  ColumnMetadata(u'aaa', None, u'int', 2),
                                                  ColumnMetadata(u'ab', None, u'varchar', 3)],
                                                 True)
            self.assertEqual(expected_second_view.__repr__(), actual_second_view.__repr__())

            self.assertIsNone(extractor.extract())