def create_table_metadata(self,
                          table: ScrapedTableMetadata) -> TableMetadata:
    '''Convert a ScrapedTableMetadata object into an Amundsen TableMetadata.'''
    # table.columns may be falsy (no columns scraped); map it to an empty list
    # so the comprehension below handles both cases uniformly.
    columns = [
        ColumnMetadata(name=col.name,
                       description=col.description,
                       col_type=col.data_type,
                       sort_order=col.sort_order)
        for col in (table.columns or [])
    ]
    return TableMetadata(self._db,
                         self._cluster,
                         table.schema,
                         table.table,
                         table.get_table_description(),
                         columns,
                         table.is_view)
    def test_extraction_with_single_result(self):
        # type: () -> None
        """Hive extractor turns a flat column-per-row result set into one
        TableMetadata and signals exhaustion with None on the next extract()."""
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            # Table-level fields shared by every column row below.
            table = {'schema': 'test_schema',
                     'name': 'test_table',
                     'description': 'a table for testing'}

            # Each entry is one column's fields merged (via self._union) with the
            # table fields, mimicking the flat rows the SQL query would return.
            sql_execute.return_value = [
                self._union(
                    {'col_name': 'col_id1',
                     'col_type': 'bigint',
                     'col_description': 'description of id1',
                     'col_sort_order': 0}, table),
                self._union(
                    {'col_name': 'col_id2',
                     'col_type': 'bigint',
                     'col_description': 'description of id2',
                     'col_sort_order': 1}, table),
                self._union(
                    {'col_name': 'is_active',
                     'col_type': 'boolean',
                     'col_description': None,
                     'col_sort_order': 2}, table),
                self._union(
                    {'col_name': 'source',
                     'col_type': 'varchar',
                     'col_description': 'description of source',
                     'col_sort_order': 3}, table),
                self._union(
                    {'col_name': 'etl_created_at',
                     'col_type': 'timestamp',
                     'col_description': 'description of etl_created_at',
                     'col_sort_order': 4}, table),
                self._union(
                    {'col_name': 'ds',
                     'col_type': 'varchar',
                     'col_description': None,
                     'col_sort_order': 5}, table)
            ]

            extractor = HiveTableMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata('hive', 'gold', 'test_schema', 'test_table', 'a table for testing',
                                     [ColumnMetadata('col_id1', 'description of id1', 'bigint', 0),
                                      ColumnMetadata('col_id2', 'description of id2', 'bigint', 1),
                                      ColumnMetadata('is_active', None, 'boolean', 2),
                                      ColumnMetadata('source', 'description of source', 'varchar', 3),
                                      ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
                                      ColumnMetadata('ds', None, 'varchar', 5)])
            # Compared via repr, as the rest of these tests do.
            self.assertEqual(expected.__repr__(), actual.__repr__())
            # The single grouped table is consumed; a second call yields None.
            self.assertIsNone(extractor.extract())
    def test_feature_table_extraction(self) -> None:
        """A mocked Feast feature table is extracted into TableMetadata."""
        self._init_extractor(programmatic_description_enabled=False)
        self.extractor._client.list_projects.return_value = ["default"]
        self._mock_feature_table()

        extracted = self.extractor.extract()
        self.extractor._client.get_entity.assert_called_with(
            "driver_id", project="default")

        expected_columns = [
            ColumnMetadata("driver_id",
                           "Internal identifier of the driver",
                           "INT64", 0),
            ColumnMetadata("trips_today", None, "INT32", 1),
        ]
        expected = TableMetadata(
            database="feast",
            cluster="unittest-feast-instance",
            schema="default",
            name="driver_trips",
            description=None,
            columns=expected_columns,
        )

        self.assertEqual(repr(expected), repr(extracted))
        # The mocked source has only one feature table to extract.
        self.assertIsNone(self.extractor.extract())
Exemplo n.º 4
0
    def test_tags_arent_populated_from_empty_list_and_str(self) -> None:
        """Neither an empty tag list nor an empty tag string may emit Tag nodes."""
        self.table_metadata6 = TableMetadata(
            'hive', 'gold', 'test_schema6', 'test_table6', 'test_table6',
            [ColumnMetadata('test_id1', 'description of test_table1',
                            'bigint', 0)],
            tags=[])

        self.table_metadata7 = TableMetadata(
            'hive', 'gold', 'test_schema7', 'test_table7', 'test_table7',
            [ColumnMetadata('test_id1', 'description of test_table1',
                            'bigint', 0)],
            tags="")

        # Drain both tables' node streams; no node may carry the Tag label.
        # table_metadata6 covers tags=[], table_metadata7 covers tags="".
        for metadata in (self.table_metadata6, self.table_metadata7):
            node_row = metadata.next_node()
            while node_row:
                serialized = neo4_serializer.serialize_node(node_row)
                self.assertNotEqual(serialized.get('LABEL'), 'Tag')
                node_row = metadata.next_node()
 def test_create_table_metadata(self) -> None:
     """create_table_metadata maps scraped columns onto Amundsen metadata."""
     scraped = ScrapedTableMetadata(schema="test_schema1",
                                    table="test_table1")
     scraped.set_columns([
         ScrapedColumnMetadata(name="a", description=None,
                               data_type="string", sort_order=0),
         ScrapedColumnMetadata(name="b", description=None,
                               data_type="int", sort_order=1),
     ])

     actual = self.dExtractor.create_table_metadata(scraped)

     expected = TableMetadata("test_database",
                              "test_cluster",
                              "test_schema1",
                              "test_table1",
                              description=None,
                              columns=[ColumnMetadata("a", None, "string", 0),
                                       ColumnMetadata("b", None, "int", 1)])
     # str() comparison mirrors how equality is checked elsewhere in the suite.
     self.assertEqual(str(expected), str(actual))
    def test_col_badge_field(self) -> None:
        """Column badges must surface as Badge nodes and HAS_BADGE relations."""
        self.table_metadata4 = TableMetadata(
            'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
            [ColumnMetadata('test_id1', 'description of test_table1',
                            'bigint', 0, ['col-badge1', 'col-badge2'])],
            is_view=False, attr1='uri', attr2='attr2')

        def drain(producer):
            # Exhaust a next_node/next_relation-style producer into a list.
            collected = []
            item = producer()
            while item:
                collected.append(item)
                item = producer()
            return collected

        nodes = drain(self.table_metadata4.next_node)
        self.assertEqual(nodes[4].get('KEY'), 'col-badge1')
        self.assertEqual(nodes[5].get('KEY'), 'col-badge2')

        relations = drain(self.table_metadata4.next_relation)

        # Both badge relations are identical except for the badge key.
        common_rel = {
            'START_LABEL': 'Column',
            'END_LABEL': 'Badge',
            'START_KEY': 'hive://gold.test_schema4/test_table4/test_id1',
            'TYPE': 'HAS_BADGE',
            'REVERSE_TYPE': 'BADGE_FOR'
        }
        self.assertEqual(relations[4], dict(common_rel, END_KEY='col-badge1'))
        self.assertEqual(relations[5], dict(common_rel, END_KEY='col-badge2'))
Exemplo n.º 7
0
    def _load_csv(self) -> None:
        """
        Load the column and table CSV files and build the iterator of
        TableMetadata this extractor serves from.
        """
        # Group columns by the table they belong to.
        with open(self.column_file_location, 'r') as fin:
            self.columns = [dict(i) for i in csv.DictReader(fin)]

        parsed_columns = defaultdict(list)
        for column_dict in self.columns:
            # renamed from `id`: don't shadow the builtin
            table_key = self._get_key(column_dict['database'],
                                      column_dict['cluster'],
                                      column_dict['schema'],
                                      column_dict['table_name'])
            parsed_columns[table_key].append(
                ColumnMetadata(name=column_dict['name'],
                               description=column_dict['description'],
                               col_type=column_dict['col_type'],
                               sort_order=int(column_dict['sort_order'])))

        # Create one TableMetadata per table row, attaching its columns.
        with open(self.table_file_location, 'r') as fin:
            tables = [dict(i) for i in csv.DictReader(fin)]

        results = []
        for table_dict in tables:
            table_key = self._get_key(table_dict['database'],
                                      table_dict['cluster'],
                                      table_dict['schema'],
                                      table_dict['name'])
            # .get() replaces the old defaultdict access plus `is None` check:
            # defaultdict lookup could never return None (the check was dead),
            # and .get() also avoids inserting empty entries as a side effect.
            columns = parsed_columns.get(table_key, [])
            results.append(TableMetadata(
                database=table_dict['database'],
                cluster=table_dict['cluster'],
                schema=table_dict['schema'],
                name=table_dict['name'],
                description=table_dict['description'],
                columns=columns,
                # TODO: this possibly should parse stringified booleans;
                # right now it only will be false for empty strings
                is_view=bool(table_dict['is_view']),
                tags=table_dict['tags']))
        self._iter = iter(results)
Exemplo n.º 8
0
    def test_tags_populated_from_str(self) -> None:
        """A comma-separated tag string must yield Tag nodes and relations."""
        self.table_metadata5 = TableMetadata(
            'hive', 'gold', 'test_schema5', 'test_table5', 'test_table5',
            [ColumnMetadata('test_id1', 'description of test_table1',
                            'bigint', 0)],
            tags="tag3, tag4")

        def drain(producer):
            # Exhaust a next_node/next_relation-style producer into a list.
            collected = []
            item = producer()
            while item:
                collected.append(item)
                item = producer()
            return collected

        # Both tags parsed from the string appear as Tag nodes.
        nodes = drain(self.table_metadata5.next_node)
        self.assertEqual(nodes[2].get('LABEL'), 'Tag')
        self.assertEqual(nodes[2].get('KEY'), 'tag3')
        self.assertEqual(nodes[3].get('KEY'), 'tag4')

        relations = drain(self.table_metadata5.next_relation)

        # Table-to-tag relationships differ only in the tag key.
        common_rel = {
            'START_LABEL': 'Table',
            'END_LABEL': 'Tag',
            'START_KEY': 'hive://gold.test_schema5/test_table5',
            'TYPE': 'TAGGED_BY',
            'REVERSE_TYPE': 'TAG'
        }
        self.assertEqual(relations[2], dict(common_rel, END_KEY='tag3'))
        self.assertEqual(relations[3], dict(common_rel, END_KEY='tag4'))
Exemplo n.º 9
0
    def test_extraction_with_single_result(self):
        # type: () -> None
        """Glue extractor converts one API table payload into TableMetadata,
        numbering columns by position and defaulting missing Comments to None."""
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            # Minimal Glue API response: one table whose StorageDescriptor
            # lists columns; 'is_active' and 'ds' deliberately omit 'Comment'.
            mock_search.return_value = [{
                'Name': 'test_table',
                'DatabaseName': 'test_schema',
                'Description': 'a table for testing',
                'StorageDescriptor': {
                    'Columns': [{
                        'Name': 'col_id1',
                        'Type': 'bigint',
                        'Comment': 'description of id1'
                    }, {
                        'Name': 'col_id2',
                        'Type': 'bigint',
                        'Comment': 'description of id2'
                    }, {
                        'Name': 'is_active',
                        'Type': 'boolean'
                    }, {
                        'Name': 'source',
                        'Type': 'varchar',
                        'Comment': 'description of source'
                    }, {
                        'Name': 'etl_created_at',
                        'Type': 'timestamp',
                        'Comment': 'description of etl_created_at'
                    }, {
                        'Name': 'ds',
                        'Type': 'varchar'
                    }]
                }
            }]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            # Columns without a 'Comment' key map to a None description;
            # sort order follows the position in the API response.
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5)
                ])
            self.assertEqual(expected.__repr__(), actual.__repr__())
            # A second extract() call signals exhaustion with None.
            self.assertIsNone(extractor.extract())
Exemplo n.º 10
0
    def setUp(self) -> None:
        """Build the shared table/query fixtures and the expected execution key."""
        super(TestQueryExecution, self).setUp()
        # Display full diffs
        self.maxDiff = None

        self.table_metadata = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1',
            [ColumnMetadata('field', '', '', 0)])

        self.query_metadata = QueryMetadata(
            sql="select * from table a where a.field > 3",
            tables=[self.table_metadata])

        self.query_join_metadata = QueryExecutionsMetadata(
            query_metadata=self.query_metadata,
            start_time=10,
            execution_count=7)

        # Key format: <query hash>-<start_time>.
        self._expected_key = '748c28f86de411b1d2b9deb6ae105eba-10'
    def test_tags_field(self):
        # type: () -> None
        """Table tags, column tags, and extra attrs all surface as nodes/relations."""
        self.table_metadata4 = TableMetadata(
            'hive', 'gold', 'test_schema4', 'test_table4', 'test_table4',
            [ColumnMetadata('test_id1', 'description of test_table1',
                            'bigint', 0, ['col-tag1', 'col-tag2'])],
            is_view=False, tags=['tag1', 'tag2'], attr1='uri', attr2='attr2')

        def drain(producer):
            # Exhaust a next_node/next_relation-style producer into a list.
            collected = []
            item = producer()
            while item:
                collected.append(item)
                item = producer()
            return collected

        nodes = drain(self.table_metadata4.next_node)

        # Extra keyword attributes land on the table node itself.
        self.assertEqual(nodes[0].get('attr1'), 'uri')
        self.assertEqual(nodes[0].get('attr2'), 'attr2')

        # Table tags then column tags appear as Tag nodes.
        self.assertEqual(nodes[2].get('LABEL'), 'Tag')
        self.assertEqual(nodes[2].get('KEY'), 'tag1')
        self.assertEqual(nodes[3].get('KEY'), 'tag2')
        self.assertEqual(nodes[6].get('KEY'), 'col-tag1')
        self.assertEqual(nodes[7].get('KEY'), 'col-tag2')

        relations = drain(self.table_metadata4.next_relation)

        # All four tag relations share everything but the END_KEY.
        common_rel = {'START_LABEL': 'Table', 'END_LABEL': 'Tag',
                      'START_KEY': 'hive://gold.test_schema4/test_table4',
                      'TYPE': 'TAGGED_BY', 'REVERSE_TYPE': 'TAG'}
        self.assertEqual(relations[2], dict(common_rel, END_KEY='tag1'))
        self.assertEqual(relations[3], dict(common_rel, END_KEY='tag2'))
        self.assertEqual(relations[6], dict(common_rel, END_KEY='col-tag1'))
        self.assertEqual(relations[7], dict(common_rel, END_KEY='col-tag2'))
    def _load_csv(self):
        # type: () -> None
        """
        Load the column and table CSV files and build the iterator of
        TableMetadata this extractor serves from.
        """
        # Group columns by the table they belong to.
        with open(self.column_file_location, 'r') as fin:
            self.columns = [dict(i) for i in csv.DictReader(fin)]

        parsed_columns = defaultdict(list)
        for column_dict in self.columns:
            # renamed from `id`: don't shadow the builtin
            table_key = self._get_key(column_dict['database'],
                                      column_dict['cluster'],
                                      column_dict['schema'],
                                      column_dict['table_name'])
            parsed_columns[table_key].append(
                ColumnMetadata(name=column_dict['name'],
                               description=column_dict['description'],
                               col_type=column_dict['col_type'],
                               sort_order=int(column_dict['sort_order'])))

        # Create one TableMetadata per table row, attaching its columns.
        with open(self.table_file_location, 'r') as fin:
            tables = [dict(i) for i in csv.DictReader(fin)]

        results = []
        for table_dict in tables:
            table_key = self._get_key(table_dict['database'],
                                      table_dict['cluster'],
                                      table_dict['schema'],
                                      table_dict['name'])
            # .get() replaces the old defaultdict access plus `is None` check:
            # defaultdict lookup could never return None (the check was dead),
            # and .get() also avoids inserting empty entries as a side effect.
            columns = parsed_columns.get(table_key, [])
            results.append(
                TableMetadata(database=table_dict['database'],
                              cluster=table_dict['cluster'],
                              schema=table_dict['schema'],
                              name=table_dict['name'],
                              description=table_dict['description'],
                              columns=columns,
                              # TODO: this possibly should parse stringified
                              # booleans; bool() is only False for the empty
                              # string, but it matches the sibling CSV loader
                              # and stops a raw str leaking into is_view.
                              is_view=bool(table_dict['is_view']),
                              tags=table_dict['tags']))
        self._iter = iter(results)
Exemplo n.º 13
0
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for _, rows in groupby(self._get_raw_extract_iter(),
                               self._get_table_key):
            columns = []
            # The loop variable retains the last row after the loop, which
            # carries the table-level fields shared by every row in the group.
            for last_row in rows:
                columns.append(
                    ColumnMetadata(last_row['col_name'],
                                   last_row['col_description'],
                                   last_row['col_type'],
                                   last_row['col_sort_order']))

            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema'], last_row['name'],
                                last_row['description'], columns)
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        '''
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        '''
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                # last_row keeps the table-level fields after the loop ends.
                last_row = row
                columns.append(
                    ColumnMetadata(row['col_name'], row['col_description'],
                                   row['col_type'], row['col_sort_order']))

            # Tags arrive as one separator-delimited string; an empty value
            # maps to None rather than [''].
            tags = (last_row['tags'].split(self._tags_separator)
                    if last_row['tags'] else None)
            is_view = last_row['is_view'] == 'true'
            yield TableMetadata(last_row['database'], last_row['cluster'],
                                last_row['schema'], last_row['name'],
                                last_row['description'], columns,
                                is_view, tags)
Exemplo n.º 15
0
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        It gets all tables and yields TableMetadata.
        The column's position in the Glue response is used as its sort order;
        missing 'Comment'/'Description' keys become None descriptions.
        :return:
        """
        for row in self._get_raw_extract_iter():
            # enumerate replaces the old range(len(...)) index loop, and
            # dict.get replaces the `'k' in d` membership check: both return
            # exactly the same values.
            columns = [
                ColumnMetadata(column['Name'],
                               column.get('Comment'),
                               column['Type'],
                               i)
                for i, column in enumerate(row['StorageDescriptor']['Columns'])
            ]

            yield TableMetadata('glue', self._cluster, row['DatabaseName'],
                                row['Name'], row.get('Description'), columns)
Exemplo n.º 16
0
    def _extract_topic_data(self) -> Iterator[TableMetadata]:
        """Yield TableMetadata for each schema-registry subject in self.subjects.

        Fetches version 1 of each subject's Avro schema from a local registry
        and maps its fields to columns (dashes replaced with underscores).
        """
        for subject in self.subjects:
            ## not handling versions because cba
            response = requests.get(
                f"http://localhost:8081/subjects/{subject}/versions/1").json()
            schema = json.loads(response['schema'])

            ## make this recursive
            # enumerate replaces the old hand-maintained `i += 1` counter.
            fields = [
                ColumnMetadata(field['name'].replace("-", "_"), 'a comment',
                               str(field['type']), position)
                for position, field in enumerate(schema['fields'])
            ]

            ## need a kafka meta data model
            yield TableMetadata('kafka2', 'gold', 'test_schema',
                                subject.replace("-", "_"), 'description',
                                fields, True, ["a tag", "second tag"])
 def _get_extract_iter(self):
     # type: () -> Iterator[TableMetadata]
     """
     Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata.
     Druid supplies no table or column descriptions, so those are left empty.
     :return:
     """
     for _, rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
         columns = []
         # The loop variable keeps the table-level fields after the loop.
         for last_row in rows:
             columns.append(ColumnMetadata(name=last_row['col_name'],
                                           description='',
                                           col_type=last_row['col_type'],
                                           sort_order=last_row['col_sort_order']))
         yield TableMetadata(database='druid',
                             cluster=self._cluster,
                             schema=last_row['schema'],
                             name=last_row['name'],
                             description='',
                             columns=columns)
Exemplo n.º 18
0
    def test(self):
        # type: () -> None
        """Transformer maps a SQL statement plus extracted table metadata to a
        TableColumnUsage with one ColumnReader per read ('*' for select-star)."""
        config = ConfigFactory.from_dict({
            SqlToTblColUsageTransformer.DATABASE_NAME:
            'database',
            SqlToTblColUsageTransformer.USER_EMAIL_ATTRIBUTE_NAME:
            'email',
            SqlToTblColUsageTransformer.SQL_STATEMENT_ATTRIBUTE_NAME:
            'statement'
        })

        # The transformer internally runs a Hive metadata extraction; feed it
        # one table then None (exhaustion) via side_effect.
        with patch.object(HiveTableMetadataExtractor, 'extract') as mock_extract,\
                patch.object(HiveTableMetadataExtractor, 'init'):
            mock_extract.side_effect = [
                TableMetadata(
                    'hive', 'gold', 'test_schema1', 'test_table1',
                    'test_table1', [
                        ColumnMetadata('test_id1',
                                       'description of test_table1', 'bigint',
                                       0),
                        ColumnMetadata('test_id2', 'description of test_id2',
                                       'bigint', 1),
                        ColumnMetadata('is_active', None, 'boolean', 2),
                        ColumnMetadata('source', 'description of source',
                                       'varchar', 3),
                        ColumnMetadata('etl_created_at',
                                       'description of etl_created_at',
                                       'timestamp', 4),
                        ColumnMetadata('ds', None, 'varchar', 5)
                    ]), None
            ]

            transformer = SqlToTblColUsageTransformer()
            transformer.init(config)
            # Input record carrying the attributes named in the config above.
            # (Email literal is scrubbed in this scraped copy.)
            foo = Foo(email='*****@*****.**',
                      statement='SELECT foo, bar FROM test_table1')

            actual = transformer.transform(foo)
            expected = TableColumnUsage(col_readers=[
                ColumnReader(database=u'database',
                             cluster=u'gold',
                             schema='test_schema1',
                             table='test_table1',
                             column='*',
                             user_email='*****@*****.**')
            ])
            self.assertEqual(expected.__repr__(), actual.__repr__())
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata.
        Athena provides no table description, so it is left empty.
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                # last_row keeps the table-level fields after the loop ends.
                last_row = row
                # Prefer the 'extras' text; fall back to the plain description.
                description = (row['extras']
                               if row['extras'] is not None
                               else row['col_description'])
                columns.append(ColumnMetadata(row['col_name'], description,
                                              row['col_type'],
                                              row['col_sort_order']))

            yield TableMetadata('athena', last_row['cluster'],
                                last_row['schema_name'], last_row['name'], '',
                                columns)
Exemplo n.º 20
0
 def _extract_table_metadata(self, object_name: str,
                             data: Dict[str, Any]) -> TableMetadata:
     """Build TableMetadata for one object from its field-describe payload."""
     # Amundsen requires a deterministic column sort order and the response
     # does not provide one, so order the fields alphabetically by name.
     ordered_fields = sorted(data["fields"], key=lambda field: field["name"])
     columns = []
     for position, field in enumerate(ordered_fields):
         columns.append(ColumnMetadata(name=field["name"],
                                       description=field["inlineHelpText"],
                                       col_type=field["type"],
                                       sort_order=position))
     return TableMetadata(
         database=self._database,
         cluster=self._cluster,
         schema=self._schema,
         name=object_name,
         # TODO: Can we extract table description / does it exist?
         description=None,
         columns=columns,
     )
    def test_table_attributes(self):
        # type: () -> None
        """Extra keyword attributes must surface on the serialized table node."""
        columns = [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
            ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
        ]
        self.table_metadata3 = TableMetadata('hive', 'gold', 'test_schema3',
                                             'test_table3', 'test_table3',
                                             columns, is_view=False,
                                             attr1='uri', attr2='attr2')

        nodes = []
        node_row = self.table_metadata3.next_node()
        while node_row:
            nodes.append(node_row)
            node_row = self.table_metadata3.next_node()

        # The first node is the table itself and carries the extra attributes.
        self.assertEqual(nodes[0].get('attr1'), 'uri')
        self.assertEqual(nodes[0].get('attr2'), 'attr2')
    def _get_column_metadata(self,
                             view_original_text: str) -> List[ColumnMetadata]:
        """
        Get Column Metadata from VIEW_ORIGINAL_TEXT from TBLS table for Presto Views.
        Columns are sorted the same way as they appear in Presto Create View SQL.
        :param view_original_text:
        :return:
        """
        # Strip the Presto marker prefix and suffix framing the payload.
        without_prefix = view_original_text.split(
            PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX, 1)[-1]
        encoded_view_info = without_prefix.rsplit(
            PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX, 1)[0]

        # view_original_text is b64 encoded:
        # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
        decoded_view_info = base64.b64decode(encoded_view_info)
        columns = json.loads(decoded_view_info).get('columns')

        return [ColumnMetadata(name=column['name'],
                               description=None,
                               col_type=column['type'],
                               sort_order=position)
                for position, column in enumerate(columns)]
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata.
        Non-empty descriptions are transliterated to ASCII with unidecode.
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                # last_row keeps the table-level fields after the loop ends.
                last_row = row
                col_description = (unidecode(row['col_description'])
                                   if row['col_description'] else None)
                columns.append(ColumnMetadata(row['col_name'], col_description,
                                              row['col_type'],
                                              row['col_sort_order']))

            table_description = (unidecode(last_row['description'])
                                 if last_row['description'] else None)
            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema_name'], last_row['name'],
                                table_description, columns,
                                last_row['is_view'] == 'true')
Exemplo n.º 24
0
    def test_extraction_with_resource_link_result(self) -> None:
        """A Glue resource link (TargetTable pointing at another catalog) must
        be skipped: only the real table is extracted, then None on exhaustion."""
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            # One ordinary table (module-level fixture) followed by a resource
            # link entry, which has a TargetTable instead of its own schema.
            mock_search.return_value = [
                test_table, {
                    "Name": "test_resource_link",
                    "DatabaseName": "test_schema",
                    "TargetTable": {
                        "CatalogId": "111111111111",
                        "DatabaseName": "test_schema_external",
                        "Name": "test_table"
                    },
                    "CatalogId": "222222222222"
                }
            ]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            # Expected metadata mirrors the test_table fixture; the resource
            # link itself must not yield a second TableMetadata.
            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1', 'bigint',
                                   0),
                    ColumnMetadata('col_id2', 'description of id2', 'bigint',
                                   1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata('partition_key1',
                                   'description of partition_key1', 'string',
                                   6),
                ], False)
            self.assertEqual(expected.__repr__(), actual.__repr__())
            # Only the real table is emitted; the next call returns None.
            self.assertIsNone(extractor.extract())
# Exemplo n.º 25 (0)
    def test_z_custom_sources(self) -> None:
        """A table built with description_source="custom" should serialize a
        Programmatic_Description node carrying that source tag."""
        columns = [
            ColumnMetadata('test_id1', 'description of test_table1', 'bigint', 0),
            ColumnMetadata('test_id2', 'description of test_id2', 'bigint', 1),
            ColumnMetadata('is_active', None, 'boolean', 2),
            ColumnMetadata('source', 'description of source', 'varchar', 3),
            ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp', 4),
            ColumnMetadata('ds', None, 'varchar', 5),
        ]
        self.custom_source = TableMetadata(
            'hive', 'gold', 'test_schema3', 'test_table4', 'test_table4',
            columns, is_view=False, description_source="custom")

        # Drain the node iterator, serializing each node as we go.
        actual = []
        node_row = self.custom_source.next_node()
        while node_row:
            actual.append(neo4_serializer.serialize_node(node_row))
            node_row = self.custom_source.next_node()

        expected = {
            'LABEL': 'Programmatic_Description',
            'KEY': 'hive://gold.test_schema3/test_table4/_custom_description',
            'description_source': 'custom',
            'description': 'test_table4',
        }
        # The programmatic description is emitted right after the table node.
        self.assertEqual(actual[1], expected)
    def test_serialize(self):
        # type: () -> None
        """Serialize two identical TableMetadata records and verify that the
        second one skips the already-emitted database/cluster/schema nodes
        and relations (de-duplication across records)."""
        self.table_metadata = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])

        # Second record is identical on purpose: it exercises de-duplication.
        self.table_metadata2 = TableMetadata(
            'hive', 'gold', 'test_schema1', 'test_table1', 'test_table1', [
                ColumnMetadata('test_id1', 'description of test_table1',
                               'bigint', 0),
                ColumnMetadata('test_id2', 'description of test_id2', 'bigint',
                               1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar',
                               3),
                ColumnMetadata('etl_created_at',
                               'description of etl_created_at', 'timestamp',
                               4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])

        # Nodes every record emits: table, descriptions, and columns.
        self.expected_nodes_deduped = [{
            'name': 'test_table1',
            'KEY': 'hive://gold.test_schema1/test_table1',
            'LABEL': 'Table',
            'is_view:UNQUOTED': False
        }, {
            'description': 'test_table1',
            'KEY': 'hive://gold.test_schema1/test_table1/_description',
            'LABEL': 'Description'
        }, {
            'sort_order:UNQUOTED': 0,
            'type': 'bigint',
            'name': 'test_id1',
            'KEY': 'hive://gold.test_schema1/test_table1/test_id1',
            'LABEL': 'Column'
        }, {
            'description': 'description of test_table1',
            'KEY':
            'hive://gold.test_schema1/test_table1/test_id1/_description',
            'LABEL': 'Description'
        }, {
            'sort_order:UNQUOTED': 1,
            'type': 'bigint',
            'name': 'test_id2',
            'KEY': 'hive://gold.test_schema1/test_table1/test_id2',
            'LABEL': 'Column'
        }, {
            'description': 'description of test_id2',
            'KEY':
            'hive://gold.test_schema1/test_table1/test_id2/_description',
            'LABEL': 'Description'
        }, {
            'sort_order:UNQUOTED': 2,
            'type': 'boolean',
            'name': 'is_active',
            'KEY': 'hive://gold.test_schema1/test_table1/is_active',
            'LABEL': 'Column'
        }, {
            'sort_order:UNQUOTED': 3,
            'type': 'varchar',
            'name': 'source',
            'KEY': 'hive://gold.test_schema1/test_table1/source',
            'LABEL': 'Column'
        }, {
            'description': 'description of source',
            'KEY': 'hive://gold.test_schema1/test_table1/source/_description',
            'LABEL': 'Description'
        }, {
            'sort_order:UNQUOTED': 4,
            'type': 'timestamp',
            'name': 'etl_created_at',
            'KEY': 'hive://gold.test_schema1/test_table1/etl_created_at',
            'LABEL': 'Column'
        }, {
            'description': 'description of etl_created_at',
            'KEY':
            'hive://gold.test_schema1/test_table1/etl_created_at/_description',
            'LABEL': 'Description'
        }, {
            'sort_order:UNQUOTED': 5,
            'type': 'varchar',
            'name': 'ds',
            'KEY': 'hive://gold.test_schema1/test_table1/ds',
            'LABEL': 'Column'
        }]

        # The first record additionally emits database/cluster/schema nodes.
        self.expected_nodes = copy.deepcopy(self.expected_nodes_deduped)
        self.expected_nodes.append({
            'name': 'hive',
            'KEY': 'database://hive',
            'LABEL': 'Database'
        })
        self.expected_nodes.append({
            'name': 'gold',
            'KEY': 'hive://gold',
            'LABEL': 'Cluster'
        })
        self.expected_nodes.append({
            'name': 'test_schema1',
            'KEY': 'hive://gold.test_schema1',
            'LABEL': 'Schema'
        })

        # Relations every record emits (schema->table, descriptions, columns).
        self.expected_rels_deduped = [{
            'END_KEY': 'hive://gold.test_schema1/test_table1',
            'START_LABEL': 'Schema',
            'END_LABEL': 'Table',
            'START_KEY': 'hive://gold.test_schema1',
            'TYPE': 'TABLE',
            'REVERSE_TYPE': 'TABLE_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/_description',
            'START_LABEL': 'Table',
            'END_LABEL': 'Description',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'DESCRIPTION',
            'REVERSE_TYPE': 'DESCRIPTION_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/test_id1',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }, {
            'END_KEY':
            'hive://gold.test_schema1/test_table1/test_id1/_description',
            'START_LABEL': 'Column',
            'END_LABEL': 'Description',
            'START_KEY': 'hive://gold.test_schema1/test_table1/test_id1',
            'TYPE': 'DESCRIPTION',
            'REVERSE_TYPE': 'DESCRIPTION_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/test_id2',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }, {
            'END_KEY':
            'hive://gold.test_schema1/test_table1/test_id2/_description',
            'START_LABEL': 'Column',
            'END_LABEL': 'Description',
            'START_KEY': 'hive://gold.test_schema1/test_table1/test_id2',
            'TYPE': 'DESCRIPTION',
            'REVERSE_TYPE': 'DESCRIPTION_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/is_active',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/source',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }, {
            'END_KEY':
            'hive://gold.test_schema1/test_table1/source/_description',
            'START_LABEL': 'Column',
            'END_LABEL': 'Description',
            'START_KEY': 'hive://gold.test_schema1/test_table1/source',
            'TYPE': 'DESCRIPTION',
            'REVERSE_TYPE': 'DESCRIPTION_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/etl_created_at',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }, {
            'END_KEY':
            'hive://gold.test_schema1/test_table1/etl_created_at/_description',
            'START_LABEL': 'Column',
            'END_LABEL': 'Description',
            'START_KEY': 'hive://gold.test_schema1/test_table1/etl_created_at',
            'TYPE': 'DESCRIPTION',
            'REVERSE_TYPE': 'DESCRIPTION_OF'
        }, {
            'END_KEY': 'hive://gold.test_schema1/test_table1/ds',
            'START_LABEL': 'Table',
            'END_LABEL': 'Column',
            'START_KEY': 'hive://gold.test_schema1/test_table1',
            'TYPE': 'COLUMN',
            'REVERSE_TYPE': 'COLUMN_OF'
        }]

        # The first record additionally emits database->cluster->schema rels.
        self.expected_rels = copy.deepcopy(self.expected_rels_deduped)
        self.expected_rels.append({
            'END_KEY': 'hive://gold',
            'START_LABEL': 'Database',
            'END_LABEL': 'Cluster',
            'START_KEY': 'database://hive',
            'TYPE': 'CLUSTER',
            'REVERSE_TYPE': 'CLUSTER_OF'
        })
        self.expected_rels.append({
            'END_KEY': 'hive://gold.test_schema1',
            'START_LABEL': 'Cluster',
            'END_LABEL': 'Schema',
            'START_KEY': 'hive://gold',
            'TYPE': 'SCHEMA',
            'REVERSE_TYPE': 'SCHEMA_OF'
        })

        # First record: expect the full node set.
        node_row = self.table_metadata.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.table_metadata.next_node()

        self.assertEqual(self.expected_nodes, actual)

        # First record: expect the full relation set.
        relation_row = self.table_metadata.next_relation()
        actual = []
        while relation_row:
            actual.append(relation_row)
            relation_row = self.table_metadata.next_relation()

        self.assertEqual(self.expected_rels, actual)

        # 2nd record should not show already serialized database, cluster, and schema
        node_row = self.table_metadata2.next_node()
        actual = []
        while node_row:
            actual.append(node_row)
            node_row = self.table_metadata2.next_node()

        self.assertEqual(self.expected_nodes_deduped, actual)

        relation_row = self.table_metadata2.next_relation()
        actual = []
        while relation_row:
            actual.append(relation_row)
            relation_row = self.table_metadata2.next_relation()

        self.assertEqual(self.expected_rels_deduped, actual)
    def test_extraction_with_multiple_result(self):
        # type: () -> None
        """Rows spanning several tables should be grouped into one
        TableMetadata per table, emitted in query-row order."""
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            # The configured cluster name is shared by all fixture tables.
            cluster = self.conf['extractor.mssql_metadata.{}'.format(
                MSSQLMetadataExtractor.CLUSTER_KEY)]

            table = {'schema_name': 'test_schema1',
                     'name': 'test_table1',
                     'description': 'test table 1',
                     'cluster': cluster}
            table1 = {'schema_name': 'test_schema1',
                      'name': 'test_table2',
                      'description': 'test table 2',
                      'cluster': cluster}
            table2 = {'schema_name': 'test_schema2',
                      'name': 'test_table3',
                      'description': 'test table 3',
                      'cluster': cluster}

            # (col_name, col_type, col_description, col_sort_order, table)
            fixture_rows = [
                ('col_id1', 'bigint', 'description of col_id1', 0, table),
                ('col_id2', 'bigint', 'description of col_id2', 1, table),
                ('is_active', 'boolean', None, 2, table),
                ('source', 'varchar', 'description of source', 3, table),
                ('etl_created_at', 'timestamp',
                 'description of etl_created_at', 4, table),
                ('ds', 'varchar', None, 5, table),
                ('col_name', 'varchar', 'description of col_name', 0, table1),
                ('col_name2', 'varchar', 'description of col_name2', 1, table1),
                ('col_id3', 'varchar', 'description of col_id3', 0, table2),
                ('col_name3', 'varchar', 'description of col_name3', 1, table2),
            ]
            sql_execute.return_value = [
                self._union({'col_name': name,
                             'col_type': col_type,
                             'col_description': description,
                             'col_sort_order': sort_order}, tbl)
                for name, col_type, description, sort_order, tbl in fixture_rows
            ]

            extractor = MSSQLMetadataExtractor()
            extractor.init(self.conf)

            expected = TableMetadata(
                'mssql', cluster, 'test_schema1', 'test_table1',
                'test table 1', [
                    ColumnMetadata('col_id1', 'description of col_id1',
                                   'bigint', 0),
                    ColumnMetadata('col_id2', 'description of col_id2',
                                   'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                ], False, ['test_schema1'])
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                'mssql', cluster, 'test_schema1', 'test_table2',
                'test table 2', [
                    ColumnMetadata('col_name', 'description of col_name',
                                   'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2',
                                   'varchar', 1)
                ], False, ['test_schema1'])
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                'mssql', cluster, 'test_schema2', 'test_table3',
                'test table 3', [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1)
                ], False, ['test_schema2'])
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            # Stream is exhausted after the third table.
            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
    def test_extraction_with_multiple_views(self) -> None:
        """Two b64-encoded Presto views should be extracted in row order with
        each view's column order preserved."""
        with patch.object(SQLAlchemyExtractor,
                          '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute

            columns1 = {'columns': [{'name': 'xyz', 'type': 'varchar'},
                                    {'name': 'xyy', 'type': 'double'},
                                    {'name': 'aaa', 'type': 'int'},
                                    {'name': 'ab', 'type': 'varchar'}]}
            columns2 = {'columns': [{'name': 'xyy', 'type': 'varchar'},
                                    {'name': 'ab', 'type': 'double'},
                                    {'name': 'aaa', 'type': 'int'},
                                    {'name': 'xyz', 'type': 'varchar'}]}

            def encode_view(payload):
                # Presto stores view definitions as base64-encoded JSON.
                return base64.b64encode(
                    json.dumps(payload).encode()).decode("utf-8")

            sql_execute.return_value = [
                {'tbl_id': 2,
                 'schema': 'test_schema2',
                 'name': 'test_view2',
                 'tbl_type': 'virtual_view',
                 'view_original_text': encode_view(columns2)},
                {'tbl_id': 1,
                 'schema': 'test_schema1',
                 'name': 'test_view1',
                 'tbl_type': 'virtual_view',
                 'view_original_text': encode_view(columns1)},
            ]

            extractor = PrestoViewMetadataExtractor()
            extractor.init(self.conf)

            expected_first_view = TableMetadata(
                'presto', 'gold', 'test_schema2', 'test_view2', None, [
                    ColumnMetadata(u'xyy', None, u'varchar', 0),
                    ColumnMetadata(u'ab', None, u'double', 1),
                    ColumnMetadata(u'aaa', None, u'int', 2),
                    ColumnMetadata(u'xyz', None, u'varchar', 3)
                ], True)
            self.assertEqual(expected_first_view.__repr__(),
                             extractor.extract().__repr__())

            expected_second_view = TableMetadata(
                'presto', 'gold', 'test_schema1', 'test_view1', None, [
                    ColumnMetadata(u'xyz', None, u'varchar', 0),
                    ColumnMetadata(u'xyy', None, u'double', 1),
                    ColumnMetadata(u'aaa', None, u'int', 2),
                    ColumnMetadata(u'ab', None, u'varchar', 3)
                ], True)
            self.assertEqual(expected_second_view.__repr__(),
                             extractor.extract().__repr__())

            # Both views consumed; nothing left to extract.
            self.assertIsNone(extractor.extract())
# Exemplo n.º 29 (0)
    def test_extraction_with_multiple_result(self) -> None:
        """Tables, a comment-described table, and a view should each be
        extracted in order from the Glue search results."""
        with patch.object(GlueExtractor, '_search_tables') as mock_search:
            mock_search.return_value = [
                test_table,
                {
                    'Name': 'test_table2',
                    'DatabaseName': 'test_schema1',
                    'Description': 'test table 2',
                    'StorageDescriptor': {
                        'Columns': [
                            {'Name': 'col_name',
                             'Type': 'varchar',
                             'Comment': 'description of col_name'},
                            {'Name': 'col_name2',
                             'Type': 'varchar',
                             'Comment': 'description of col_name2'},
                        ]
                    },
                    'TableType': 'EXTERNAL_TABLE',
                },
                {
                    'Name': 'test_table3',
                    'DatabaseName': 'test_schema2',
                    'StorageDescriptor': {
                        'Columns': [
                            {'Name': 'col_id3',
                             'Type': 'varchar',
                             'Comment': 'description of col_id3'},
                            {'Name': 'col_name3',
                             'Type': 'varchar',
                             'Comment': 'description of col_name3'},
                        ]
                    },
                    # No top-level Description: table description comes from
                    # the Parameters comment instead.
                    'Parameters': {
                        'comment': 'description of test table 3 from comment'
                    },
                    'TableType': 'EXTERNAL_TABLE',
                },
                {
                    'Name': 'test_view1',
                    'DatabaseName': 'test_schema1',
                    'Description': 'test view 1',
                    'StorageDescriptor': {
                        'Columns': [
                            {'Name': 'col_id3',
                             'Type': 'varchar',
                             'Comment': 'description of col_id3'},
                            {'Name': 'col_name3',
                             'Type': 'varchar',
                             'Comment': 'description of col_name3'},
                        ]
                    },
                    'TableType': 'VIRTUAL_VIEW',
                },
            ]

            extractor = GlueExtractor()
            extractor.init(self.conf)

            expected = TableMetadata(
                'glue', 'gold', 'test_schema', 'test_table',
                'a table for testing', [
                    ColumnMetadata('col_id1', 'description of id1',
                                   'bigint', 0),
                    ColumnMetadata('col_id2', 'description of id2',
                                   'bigint', 1),
                    ColumnMetadata('is_active', None, 'boolean', 2),
                    ColumnMetadata('source', 'description of source',
                                   'varchar', 3),
                    ColumnMetadata('etl_created_at',
                                   'description of etl_created_at',
                                   'timestamp', 4),
                    ColumnMetadata('ds', None, 'varchar', 5),
                    ColumnMetadata('partition_key1',
                                   'description of partition_key1',
                                   'string', 6),
                ], False)
            self.assertEqual(repr(expected), repr(extractor.extract()))

            expected = TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_table2',
                'test table 2', [
                    ColumnMetadata('col_name', 'description of col_name',
                                   'varchar', 0),
                    ColumnMetadata('col_name2', 'description of col_name2',
                                   'varchar', 1),
                ], False)
            self.assertEqual(repr(expected), repr(extractor.extract()))

            expected = TableMetadata(
                'glue', 'gold', 'test_schema2', 'test_table3',
                'description of test table 3 from comment', [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1),
                ], False)
            self.assertEqual(repr(expected), repr(extractor.extract()))

            expected = TableMetadata(
                'glue', 'gold', 'test_schema1', 'test_view1',
                'test view 1', [
                    ColumnMetadata('col_id3', 'description of col_id3',
                                   'varchar', 0),
                    ColumnMetadata('col_name3', 'description of col_name3',
                                   'varchar', 1),
                ], True)
            self.assertEqual(repr(expected), repr(extractor.extract()))

            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
    def test_extraction_with_single_result(self, mock_connect: MagicMock) -> None:
        """
        Test Extraction with single table result from query
        """
        mock_connection = MagicMock()
        mock_connect.return_value = mock_connection

        mock_cursor = MagicMock()
        mock_connection.cursor.return_value = mock_cursor
        mock_cursor.execute = MagicMock()

        # Cursor description: one single-element list per result column name.
        mock_cursor.description = [[field] for field in (
            'col_name', 'col_description', 'col_type', 'col_sort_order',
            'database', 'cluster', 'schema', 'name', 'description',
            'is_view')]

        # Table-level fields appended to every column row.
        # (List[Any] keeps flake8 happy about the `+` below.)
        table_fields: List[Any] = [
            'DREMIO',
            'Production',
            'test_schema',
            'test_table',
            'a table for testing',
            'false'
        ]

        column_rows: List[List[Any]] = [
            ['col_id1', 'description of id1', 'number', 0],
            ['col_id2', 'description of id2', 'number', 1],
            ['is_active', None, 'boolean', 2],
            ['source', 'description of source', 'varchar', 3],
            ['etl_created_at', 'description of etl_created_at',
             'timestamp_ltz', 4],
            ['ds', None, 'varchar', 5]
        ]

        mock_cursor.execute.return_value = [
            row + table_fields for row in column_rows]

        extractor = DremioMetadataExtractor()
        extractor.init(self.conf)

        actual = extractor.extract()
        expected = TableMetadata(
            'DREMIO', 'Production', 'test_schema', 'test_table',
            'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'number', 0),
             ColumnMetadata('col_id2', 'description of id2', 'number', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at',
                            'timestamp_ltz', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])

        self.assertEqual(repr(expected), repr(actual))
        self.assertIsNone(extractor.extract())