Example #1
0
    def test_get_connection(self: Any, mock_method: Any) -> None:
        """
        Verify that configured values reach the _get_connection method
        unchanged, both with and without explicit connect_args.
        """

        def run_with(config: Dict[str, Any]) -> None:
            # Build a scoped config from the raw dict and drive the extractor
            # far enough to trigger the (mocked) connection call.
            instance = SQLAlchemyExtractor()
            scoped = Scoped.get_scoped_conf(
                conf=ConfigFactory.from_dict(config),
                scope=instance.get_scope())
            instance.init(scoped)
            instance._get_connection()

        # Without explicit connect_args an empty dict is forwarded.
        run_with({
            'extractor.sqlalchemy.conn_string': 'TEST_CONNECTION',
            'extractor.sqlalchemy.extract_sql': 'SELECT 1 FROM TEST_TABLE;'
        })
        mock_method.assert_called_with('TEST_CONNECTION', connect_args={})

        # Explicit connect_args are passed straight through.
        run_with({
            'extractor.sqlalchemy.conn_string': 'TEST_CONNECTION',
            'extractor.sqlalchemy.extract_sql': 'SELECT 1 FROM TEST_TABLE;',
            'extractor.sqlalchemy.connect_args': {
                "protocol": "https"
            },
        })
        mock_method.assert_called_with('TEST_CONNECTION',
                                       connect_args={"protocol": "https"})
    def _get_non_partitioned_table_sql_alchemy_extractor(self) -> Extractor:
        """
        Getting an SQLAlchemy extractor that extracts storage location for non-partitioned table for further probing
        last updated timestamp

        :return: an initialized SQLAlchemyExtractor
        """
        # If the caller configured an extra WHERE clause, AND it together with
        # the mandatory filter; otherwise the mandatory filter stands alone.
        if HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY in self._conf:
            where_clause_suffix = """
            {}
            AND {}
            """.format(
                self._conf.get_string(
                    HiveTableLastUpdatedExtractor.
                    NON_PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY),
                HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)
        else:
            where_clause_suffix = 'WHERE {}'.format(
                HiveTableLastUpdatedExtractor.ADDTIONAL_WHERE_CLAUSE)

        sql_stmt = HiveTableLastUpdatedExtractor.NON_PARTITIONED_TABLE_SQL_STATEMENT.format(
            where_clause_suffix=where_clause_suffix)

        # Lazy %s formatting so the message is only rendered when INFO logging
        # is enabled, matching the sibling partitioned-table method.
        LOGGER.info(
            'SQL for non-partitioned table against Hive metastore: %s',
            sql_stmt)

        sql_alchemy_extractor = SQLAlchemyExtractor()
        sql_alchemy_conf = Scoped.get_scoped_conf(self._conf, sql_alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: sql_stmt}))
        sql_alchemy_extractor.init(sql_alchemy_conf)
        return sql_alchemy_extractor
Example #3
0
    def test_extraction_with_model_class(self: Any, mock_method: Any) -> None:
        """
        Extraction should hydrate raw result rows into the configured
        model class.
        """
        self.conf = ConfigFactory.from_dict({
            'extractor.sqlalchemy.conn_string':
            'TEST_CONNECTION',
            'extractor.sqlalchemy.extract_sql':
            'SELECT 1 FROM TEST_TABLE;',
            'extractor.sqlalchemy.model_class':
            'tests.unit.extractor.test_sql_alchemy_extractor.TableMetadataResult'
        })

        extractor = SQLAlchemyExtractor()
        # Pre-seed the raw query results the extractor would normally fetch.
        extractor.results = [{
            'database': 'test_database',
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'test_description',
            'column_name': 'test_column_name',
            'column_type': 'test_column_type',
            'column_comment': 'test_column_comment',
            'owner': 'test_owner',
        }]

        scoped_conf = Scoped.get_scoped_conf(conf=self.conf,
                                             scope=extractor.get_scope())
        extractor.init(scoped_conf)

        result = extractor.extract()

        self.assertIsInstance(result, TableMetadataResult)
        self.assertEqual(result.name, 'test_table')
 def test_extraction_with_single_query_result(self, mock_method):
     # type: (Any, Any) -> None
     """
     A lone query row is returned as-is by extract().
     """
     extractor = SQLAlchemyExtractor()
     # NOTE: ('test_result') is just a parenthesized string, not a tuple.
     extractor.results = [('test_result'), ]
     scoped = Scoped.get_scoped_conf(conf=self.conf,
                                     scope=extractor.get_scope())
     extractor.init(scoped)
     self.assertEqual(extractor.extract(), 'test_result')
Example #5
0
    def test_extraction_with_empty_query_result(self: Any,
                                                mock_method: Any) -> None:
        """
        extract() passes an empty-string row through unchanged.
        """
        extractor = SQLAlchemyExtractor()
        extractor.results = ['']
        scoped = Scoped.get_scoped_conf(conf=self.conf,
                                        scope=extractor.get_scope())
        extractor.init(scoped)
        self.assertEqual(extractor.extract(), '')
    def test_extraction_with_multiple_query_result(self, mock_method):
        # type: (Any, Any) -> None
        """
        Successive extract() calls drain the seeded result list in order.
        """
        expected = ['test_result', 'test_result2', 'test_result3']

        extractor = SQLAlchemyExtractor()
        extractor.results = ['test_result', 'test_result2', 'test_result3']
        extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                              scope=extractor.get_scope()))

        extracted = []
        for _ in range(3):
            extracted.append(extractor.extract())

        self.assertEqual(len(extracted), 3)
        self.assertEqual(extracted, expected)
    def _get_partitioned_table_sql_alchemy_extractor(self) -> Extractor:
        """
        Build an SQLAlchemy extractor that reads last updated timestamps of
        partitioned tables from the Hive metastore.
        :return: SQLAlchemyExtractor
        """
        # Fall back to a blank suffix when no WHERE clause is configured.
        where_clause = self._conf.get_string(
            HiveTableLastUpdatedExtractor.PARTITIONED_TABLE_WHERE_CLAUSE_SUFFIX_KEY, ' ')
        sql_stmt = HiveTableLastUpdatedExtractor.PARTITION_TABLE_SQL_STATEMENT.format(
            where_clause_suffix=where_clause)

        LOGGER.info('SQL for partitioned table against Hive metastore: %s', sql_stmt)

        extractor = SQLAlchemyExtractor()
        fallback = ConfigFactory.from_dict(
            {SQLAlchemyExtractor.EXTRACT_SQL: sql_stmt})
        extractor.init(
            Scoped.get_scoped_conf(self._conf, extractor.get_scope())
            .with_fallback(fallback))
        return extractor
Example #8
0
class VerticaMetadataExtractor(Extractor):
    """
    Extracts vertica table and column metadata from underlying meta store database using SQLAlchemyExtractor
    V_CATALOG does not have table and column description columns
    CLUSTER_KEY config parameter is used as cluster name
    Not distinguishing between table & view for now
    """
    # SELECT statement from vertica information_schema to extract table and column metadata
    SQL_STATEMENT = """
        SELECT
        lower(c.column_name) AS col_name,
        lower(c.data_type) AS col_type,
        c.ordinal_position AS col_sort_order,
        {cluster_source} AS cluster,
        lower(c.table_schema) AS "schema",
        lower(c.table_name) AS name,
        False AS is_view
        FROM
        V_CATALOG.COLUMNS AS c
        LEFT JOIN
        V_CATALOG.TABLES t
            ON c.TABLE_NAME = t.TABLE_NAME
            AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
        {where_clause_suffix}
        ORDER by cluster, "schema", name, col_sort_order ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {WHERE_CLAUSE_SUFFIX_KEY: ' ', CLUSTER_KEY: DEFAULT_CLUSTER_NAME, USE_CATALOG_AS_CLUSTER_NAME: False}
    )

    def init(self, conf: ConfigTree) -> None:
        """
        Render the metadata SQL from config and initialize the wrapped
        SQLAlchemyExtractor that will execute it.
        """
        conf = conf.with_fallback(VerticaMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(conf.get_string(VerticaMetadataExtractor.CLUSTER_KEY))

        # Either take the cluster name from Vertica's catalog column or use
        # the configured static name (quoted so it becomes a SQL literal).
        if conf.get_bool(VerticaMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        self._database = conf.get_string(VerticaMetadataExtractor.DATABASE_KEY, default='vertica')

        self.sql_stmt = VerticaMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(VerticaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

        # Lazy %s formatting so the message is only built when INFO logging is
        # enabled, consistent with the other extractors in this file.
        LOGGER.info('SQL for vertica metadata: %s', self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """
        :return: the next TableMetadata record, or None when exhausted
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return 'extractor.vertica_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                # Keep the latest row; its table-level fields describe the group.
                last_row = row
                # V_CATALOG exposes no column descriptions, hence None.
                columns.append(ColumnMetadata(row['col_name'], None,
                                              row['col_type'], row['col_sort_order']))

            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema'],
                                last_row['name'],
                                None,
                                columns,
                                is_view=last_row['is_view'])

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
class SnowflakeMetadataExtractor(Extractor):
    """
    Extracts Snowflake table and column metadata from underlying meta store
    database using SQLAlchemyExtractor.
    Requirements:
        snowflake-connector-python
        snowflake-sqlalchemy
    """

    SQL_STATEMENT = """
    SELECT
        lower(c.column_name) AS col_name,
        c.comment AS col_description,
        lower(c.data_type) AS data_type,
        lower(c.ordinal_position) AS col_sort_order,
        lower('{database}') AS database,
        lower(c.table_catalog) AS cluster,
        lower(c.table_schema) AS schema,
        lower(c.table_name) AS name,
        t.comment AS description,
        decode(lower(t.table_type), 'view', 'true', 'false') AS is_view
    FROM
        {cluster}.INFORMATION_SCHEMA.COLUMNS AS c
    LEFT JOIN
        {cluster}.INFORMATION_SCHEMA.TABLES t
            ON c.TABLE_NAME = t.TABLE_NAME
            AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
    {where_clause_suffix};
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = "where_clause_suffix"
    DATABASE_KEY = "database"
    CLUSTER_KEY = "cluster"

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: "",
        DATABASE_KEY: "snowflake",
        CLUSTER_KEY: "master",
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Render the metadata SQL from config and initialize the wrapped
        SQLAlchemyExtractor that executes it.
        """
        self.conf = conf.with_fallback(
            SnowflakeMetadataExtractor.DEFAULT_CONFIG)
        self._database = self.conf.get_string(
            SnowflakeMetadataExtractor.DATABASE_KEY)
        self._cluster = "{}".format(
            self.conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY))

        # Use the declared constant instead of a duplicated string literal so
        # the lookup cannot drift from WHERE_CLAUSE_SUFFIX_KEY.
        self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=self.conf.get_string(
                SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster=self._cluster,
            database=self._database,
        )

        # Lazy %s formatting: the message is only built when INFO is enabled.
        LOGGER.info("SQL for snowflake: %s", self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alchemy_scope = self._alchemy_extractor.get_scope()
        sql_alchemy_conf = Scoped.get_scoped_conf(conf, sql_alchemy_scope)
        sql_alchemy_conf.put(SQLAlchemyExtractor.EXTRACT_SQL, self.sql_stmt)

        self._alchemy_extractor.init(sql_alchemy_conf)
        self._extract_iter = None

    def extract(self) -> Optional[TableMetadata]:
        """
        :return: the next TableMetadata record, or None when exhausted
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return "extractor.snowflake"

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and
        yields TableMetadata
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                # unidecode strips non-ASCII characters from free-form comments.
                column_description = (unidecode(row["col_description"])
                                      if row["col_description"] else None)
                # Keep the latest row; its table-level fields describe the group.
                last_row = row
                columns.append(
                    ColumnMetadata(
                        name=row["col_name"],
                        description=column_description,
                        data_type=row["data_type"],
                        sort_order=row["col_sort_order"],
                    ))

            description = (unidecode(last_row["description"])
                           if last_row["description"] else None)

            yield TableMetadata(
                database=self._database,
                cluster=last_row["cluster"],
                schema=last_row["schema"],
                name=last_row["name"],
                description=description,
                columns=columns,
                # The SQL emits the literal strings 'true'/'false' for is_view.
                is_view=last_row["is_view"] == "true",
            )

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Optional[TableKey]:
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row["schema"], table_name=row["name"])

        return None
class HiveTableMetadataExtractor(Extractor):
    """
    Extracts Hive table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """
    # SELECT statement from hive metastore database to extract table and column metadata
    # Below SELECT statement uses UNION to combining two queries together.
    # 1st query is retrieving partition columns
    # 2nd query is retrieving columns
    # Using UNION to combine above two statements and order by table & partition identifier.
    SQL_STATEMENT = """
    SELECT source.* FROM
    (SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
           p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
           p.PKEY_TYPE as col_type, p.PKEY_COMMENT as col_description, 1 as "is_partition_col",
           IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
    LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
    {where_clause_suffix}
    UNION
    SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
           c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
           c.TYPE_NAME as col_type, c.COMMENT as col_description, 0 as "is_partition_col",
           IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    JOIN SDS s ON t.SD_ID = s.SD_ID
    JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID
    LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
    {where_clause_suffix}
    ) source
    ORDER by tbl_id, is_partition_col desc;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: 'gold'
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Render the metadata SQL from config and initialize the wrapped
        SQLAlchemyExtractor that will execute it.
        """
        conf = conf.with_fallback(HiveTableMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(HiveTableMetadataExtractor.CLUSTER_KEY))

        self.sql_stmt = HiveTableMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

        # Lazy %s formatting so the message is only built when INFO logging is
        # enabled, consistent with the other extractors in this file.
        LOGGER.info('SQL for hive metastore: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """
        :return: the next TableMetadata record, or None when exhausted
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return 'extractor.hive_table_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                # Keep the latest row; its table-level fields describe the group.
                last_row = row
                columns.append(
                    ColumnMetadata(row['col_name'], row['col_description'],
                                   row['col_type'], row['col_sort_order']))
            # The SQL emits 1 for VIRTUAL_VIEW tables, 0 otherwise.
            is_view = last_row['is_view'] == 1
            yield TableMetadata('hive',
                                self._cluster,
                                last_row['schema'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                is_view=is_view)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
Example #11
0
class BasePostgresMetadataExtractor(Extractor):
    """
    Extracts Postgres table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: 'true',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True
    })

    @abc.abstractmethod
    def get_sql_statement(self, use_catalog_as_cluster_name: bool,
                          where_clause_suffix: str) -> Any:
        """
        Subclasses supply the dialect-specific metadata SQL.
        :return: Provides a record or None if no more to extract
        """
        return None

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve config, render the metadata SQL via the subclass hook, and
        initialize the wrapped SQLAlchemyExtractor.
        """
        conf = conf.with_fallback(BasePostgresMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(
            BasePostgresMetadataExtractor.CLUSTER_KEY)

        self._database = conf.get_string(
            BasePostgresMetadataExtractor.DATABASE_KEY, default='postgres')

        use_catalog = conf.get_bool(
            BasePostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME)
        suffix = conf.get_string(
            BasePostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY)
        self.sql_stmt = self.get_sql_statement(
            use_catalog_as_cluster_name=use_catalog,
            where_clause_suffix=suffix,
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        scoped_conf = Scoped.get_scoped_conf(
            conf, self._alchemy_extractor.get_scope())
        sql_alch_conf = scoped_conf.with_fallback(
            ConfigFactory.from_dict(
                {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info('SQL for postgres metadata: %s', self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata record, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        # next() with a default absorbs StopIteration for us.
        return next(self._extract_iter, None)

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Group raw rows by table (itertools.groupby over the raw iterator) and
        yield one TableMetadata per table.
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            rows = list(group)
            # Any row carries the table-level fields; use the last one.
            last_row = rows[-1]
            columns = [
                ColumnMetadata(r['col_name'], r['col_description'],
                               r['col_type'], r['col_sort_order'])
                for r in rows
            ]

            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema'], last_row['name'],
                                last_row['description'], columns)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Yield result rows from the wrapped SQLAlchemy extractor until it is
        exhausted.
        :return:
        """
        while True:
            record = self._alchemy_extractor.extract()
            if not record:
                return
            yield record

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Grouping key: schema plus table name (None for falsy rows).
        :param row:
        :return:
        """
        if not row:
            return None
        return TableKey(schema=row['schema'], table_name=row['name'])
Example #12
0
class SnowflakeTableLastUpdatedExtractor(Extractor):
    """
    Extracts Snowflake table last update time from INFORMATION_SCHEMA metadata tables using SQLAlchemyExtractor.
    Requirements:
        snowflake-connector-python
        snowflake-sqlalchemy
    """
    # https://docs.snowflake.com/en/sql-reference/info-schema/views.html#columns
    # 'last_altered' column in 'TABLES` metadata view under 'INFORMATION_SCHEMA' contains last time when the table was
    # updated (both DML and DDL update). Below query fetches that column for each table.
    SQL_STATEMENT = """
        SELECT
            lower({cluster_source}) AS cluster,
            lower(t.table_schema) AS schema,
            lower(t.table_name) AS table_name,
            DATE_PART(EPOCH, t.last_altered) AS last_updated_time
        FROM
            {database}.INFORMATION_SCHEMA.TABLES t
        {where_clause_suffix};
        """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    # Database Key, used to identify the database type in the UI.
    DATABASE_KEY = 'database_key'
    # Snowflake Database Key, used to determine which Snowflake database to connect to.
    SNOWFLAKE_DATABASE_KEY = 'snowflake_database'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' WHERE t.last_altered IS NOT NULL ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True,
        DATABASE_KEY: 'snowflake',
        SNOWFLAKE_DATABASE_KEY: 'prod'
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Render the last-updated SQL from config and initialize the wrapped
        SQLAlchemyExtractor that will execute it.
        """
        conf = conf.with_fallback(
            SnowflakeTableLastUpdatedExtractor.DEFAULT_CONFIG)

        # Either take the cluster name from Snowflake's catalog column or use
        # the configured static name (quoted so it becomes a SQL literal).
        if conf.get_bool(SnowflakeTableLastUpdatedExtractor.
                         USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "t.table_catalog"
        else:
            cluster_source = "'{}'".format(
                conf.get_string(
                    SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY))

        self._database = conf.get_string(
            SnowflakeTableLastUpdatedExtractor.DATABASE_KEY)
        self._snowflake_database = conf.get_string(
            SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY)

        self.sql_stmt = SnowflakeTableLastUpdatedExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source,
            database=self._snowflake_database)

        # Lazy %s formatting so the message is only built when INFO logging is
        # enabled, consistent with the other extractors in this file.
        LOGGER.info(
            'SQL for snowflake table last updated timestamp: %s',
            self.sql_stmt)

        # use an sql_alchemy_extractor to execute sql
        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableLastUpdated, None]:
        """
        :return: the next TableLastUpdated record, or None when exhausted
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return 'extractor.snowflake_table_last_updated'

    def _get_extract_iter(self) -> Iterator[TableLastUpdated]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        """
        tbl_last_updated_row = self._alchemy_extractor.extract()
        while tbl_last_updated_row:
            yield TableLastUpdated(
                table_name=tbl_last_updated_row['table_name'],
                last_updated_time_epoch=tbl_last_updated_row[
                    'last_updated_time'],
                schema=tbl_last_updated_row['schema'],
                db=self._database,
                cluster=tbl_last_updated_row['cluster'])
            tbl_last_updated_row = self._alchemy_extractor.extract()
class MSSQLMetadataExtractor(Extractor):
    """
    Extracts Microsoft SQL Server table and column metadata from underlying
    meta store database using SQLAlchemyExtractor
    """

    # SELECT statement from MS SQL to extract table and column metadata.
    # Table/column descriptions come from the 'MS_Description' extended
    # property (MINOR_ID = 0 for the table itself, = ordinal position for
    # individual columns).
    SQL_STATEMENT = """
        SELECT DISTINCT
            {cluster_source} AS cluster,
            TBL.TABLE_SCHEMA AS [schema_name],
            TBL.TABLE_NAME AS [name],
            CAST(PROP.VALUE AS NVARCHAR(MAX)) AS [description],
            COL.COLUMN_NAME AS [col_name],
            COL.DATA_TYPE AS [col_type],
            CAST(PROP_COL.VALUE AS NVARCHAR(MAX)) AS [col_description],
            COL.ORDINAL_POSITION AS col_sort_order
        FROM INFORMATION_SCHEMA.TABLES TBL
        INNER JOIN INFORMATION_SCHEMA.COLUMNS COL
            ON (COL.TABLE_NAME = TBL.TABLE_NAME
                AND COL.TABLE_SCHEMA = TBL.TABLE_SCHEMA )
        LEFT JOIN SYS.EXTENDED_PROPERTIES PROP
            ON (PROP.MAJOR_ID = OBJECT_ID(TBL.TABLE_SCHEMA + '.' + TBL.TABLE_NAME)
                AND PROP.MINOR_ID = 0
                AND PROP.NAME = 'MS_Description')
        LEFT JOIN SYS.EXTENDED_PROPERTIES PROP_COL
            ON (PROP_COL.MAJOR_ID = OBJECT_ID(TBL.TABLE_SCHEMA + '.' + TBL.TABLE_NAME)
                AND PROP_COL.MINOR_ID = COL.ORDINAL_POSITION
                AND PROP_COL.NAME = 'MS_Description')
        WHERE TBL.TABLE_TYPE = 'base table' {where_clause_suffix}
        ORDER BY
            CLUSTER,
            SCHEMA_NAME,
            NAME,
            COL_SORT_ORDER
        ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'DB_NAME()'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: '',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True
    })

    # Template applied to a configured schema list to restrict the crawl.
    DEFAULT_WHERE_CLAUSE_VALUE = 'and tbl.table_schema in {schemas}'

    def init(self, conf: ConfigTree) -> None:
        """
        Render the metadata SQL from config and initialize the wrapped
        SQLAlchemyExtractor that will execute it.
        """
        conf = conf.with_fallback(MSSQLMetadataExtractor.DEFAULT_CONFIG)

        self._cluster = conf.get_string(MSSQLMetadataExtractor.CLUSTER_KEY)

        # Either take the cluster name from DB_NAME() at query time or use the
        # configured static name (quoted so it becomes a SQL literal).
        if conf.get_bool(MSSQLMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "DB_NAME()"
        else:
            cluster_source = f"'{self._cluster}'"

        self._database = conf.get_string(MSSQLMetadataExtractor.DATABASE_KEY,
                                         default='mssql')

        config_where_clause = conf.get_string(
            MSSQLMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY)

        LOGGER.info("Crawling for Schemas %s", config_where_clause)

        # Wrap a configured schema list in the IN-clause template; otherwise
        # leave the suffix empty so the base WHERE clause stands alone.
        if config_where_clause:
            where_clause_suffix = MSSQLMetadataExtractor \
                .DEFAULT_WHERE_CLAUSE_VALUE \
                .format(schemas=config_where_clause)
        else:
            where_clause_suffix = ''

        self.sql_stmt = MSSQLMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=where_clause_suffix,
            cluster_source=cluster_source)

        LOGGER.info('SQL for MS SQL Metadata: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped \
            .get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """
        :return: the next TableMetadata record, or None when exhausted
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        return 'extractor.mssql_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator,
        it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                # Keep the latest row; its table-level fields describe the group.
                last_row = row
                columns.append(
                    ColumnMetadata(row['col_name'], row['col_description'],
                                   row['col_type'], row['col_sort_order']))

            yield TableMetadata(self._database,
                                last_row['cluster'],
                                last_row['schema_name'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                # Schema name doubles as the table tag here.
                                tags=last_row['schema_name'])

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema_name=row['schema_name'],
                            table_name=row['name'])

        return None
class SnowflakeMetadataExtractor(Extractor):
    """
    Extracts Snowflake table and column metadata from underlying meta store database using SQLAlchemyExtractor.
    Requirements:
        snowflake-connector-python
        snowflake-sqlalchemy
    """
    # SELECT statement from snowflake information_schema to extract table and column metadata
    # https://docs.snowflake.com/en/sql-reference/account-usage.html#label-account-usage-views
    # This can be modified to use account_usage for performance at the cost of latency if necessary.
    SQL_STATEMENT = """
    SELECT
        lower(c.column_name) AS col_name,
        c.comment AS col_description,
        lower(c.data_type) AS col_type,
        lower(c.ordinal_position) AS col_sort_order,
        lower(c.table_catalog) AS database,
        lower({cluster_source}) AS cluster,
        lower(c.table_schema) AS schema,
        lower(c.table_name) AS name,
        t.comment AS description,
        decode(lower(t.table_type), 'view', 'true', 'false') AS is_view
    FROM
        {database}.{schema}.COLUMNS AS c
    LEFT JOIN
        {database}.{schema}.TABLES t
            ON c.TABLE_NAME = t.TABLE_NAME
            AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
    {where_clause_suffix};
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    # Database Key, used to identify the database type in the UI.
    DATABASE_KEY = 'database_key'
    # Snowflake Database Key, used to determine which Snowflake database to connect to.
    SNOWFLAKE_DATABASE_KEY = 'snowflake_database'
    # Snowflake Schema Key, used to determine which Snowflake schema to use.
    SNOWFLAKE_SCHEMA_KEY = 'snowflake_schema'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True,
        DATABASE_KEY: 'snowflake',
        SNOWFLAKE_DATABASE_KEY: 'prod',
        SNOWFLAKE_SCHEMA_KEY: 'INFORMATION_SCHEMA'
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL, and initialize the
        wrapped SQLAlchemyExtractor that will execute it.
        """
        conf = conf.with_fallback(SnowflakeMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY))

        # Either report the real catalog per row, or a fixed configured name.
        if conf.get_bool(
                SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        self._database = conf.get_string(
            SnowflakeMetadataExtractor.DATABASE_KEY)
        # BUG FIX: this previously re-read DATABASE_KEY (copy/paste error), so
        # self._schema held the UI database name instead of the configured
        # Snowflake schema. NOTE(review): attribute is unused within this
        # class — confirm no external caller relied on the old value.
        self._schema = conf.get_string(
            SnowflakeMetadataExtractor.SNOWFLAKE_SCHEMA_KEY)
        self._snowflake_database = conf.get_string(
            SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY)
        self._snowflake_schema = conf.get_string(
            SnowflakeMetadataExtractor.SNOWFLAKE_SCHEMA_KEY)

        self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source,
            database=self._snowflake_database,
            schema=self._snowflake_schema)

        # Lazy %-style args avoid formatting work when INFO logging is off.
        LOGGER.info('SQL for snowflake metadata: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over grouped table metadata.
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata record, or None once exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        """Config scope under which this extractor's settings are nested."""
        return 'extractor.snowflake'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                # unidecode normalizes non-ASCII description text to ASCII.
                columns.append(
                    ColumnMetadata(
                        row['col_name'],
                        unidecode(row['col_description'])
                        if row['col_description'] else None, row['col_type'],
                        row['col_sort_order']))

            yield TableMetadata(
                self._database, last_row['cluster'], last_row['schema'],
                last_row['name'],
                unidecode(last_row['description']) if last_row['description']
                else None, columns, last_row['is_view'] == 'true')

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
class OracleMetadataExtractor(Extractor):
    """
    Extracts Oracle Autonomous Database table and column metadata from the
    underlying meta store database using SQLAlchemyExtractor.
    """
    # Table/column metadata query against user_tab_columns, with descriptions
    # joined in from user_tab_comments and user_col_comments.
    SQL_STATEMENT = """
    SELECT {cluster_source} as "cluster",(SELECT sys_context('USERENV', 'CURRENT_SCHEMA') FROM dual) as "schema_name", a.table_name as "name", c.comments as "description",
    a.column_name as "col_name", a.data_type as "col_type", d.comments as "col_description", a.column_id as "col_sort_order"
    FROM user_tab_columns a
    JOIN user_all_tables b ON a.table_name = b.table_name
    JOIN user_tab_comments c  ON a.table_name = c.table_name
    JOIN user_col_comments d ON a.table_name = d.table_name AND a.column_name =  d.column_name
    ORDER BY b.cluster_name,(SELECT sys_context('USERENV', 'CURRENT_SCHEMA') FROM dual),a.table_name,a.column_id
    """

    # CONFIG KEYS
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'
    DEFAULT_DATABASE_NAME = 'autonomous'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: False,
        DATABASE_KEY: DEFAULT_DATABASE_NAME
    })

    def init(self, conf):
        # type: (ConfigTree) -> None
        """Resolve config, render the SQL and set up the SQLAlchemy extractor."""
        conf = conf.with_fallback(OracleMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(OracleMetadataExtractor.CLUSTER_KEY))

        # The cluster column comes either from the catalog or a fixed literal.
        if conf.get_bool(OracleMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            source_of_cluster = "b.cluster_name"
        else:
            source_of_cluster = "'{}'".format(self._cluster)

        self._database = conf.get_string(OracleMetadataExtractor.DATABASE_KEY,
                                         default='oracle')

        self.sql_stmt = OracleMetadataExtractor.SQL_STATEMENT.format(
            cluster_source=source_of_cluster)
        LOGGER.info('SQL for oracle metadata: {}'.format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over grouped table metadata.
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        """Return the next TableMetadata record, or None once exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def get_scope(self):
        # type: () -> str
        """Config scope under which this extractor's settings are nested."""
        return 'extractor.oracle_metadata'

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """Group raw rows by table key and emit one TableMetadata per table."""
        for _, table_rows in groupby(self._get_raw_extract_iter(),
                                     self._get_table_key):
            col_list = []
            table_row = None
            for table_row in table_rows:
                col_list.append(
                    ColumnMetadata(table_row['col_name'],
                                   table_row['col_description'],
                                   table_row['col_type'],
                                   table_row['col_sort_order']))

            # groupby never yields an empty group, so table_row is the last row.
            yield TableMetadata(self._database, table_row['cluster'],
                                table_row['schema_name'], table_row['name'],
                                table_row['description'], col_list)

    def _get_raw_extract_iter(self):
        # type: () -> Iterator[Dict[str, Any]]
        """Yield result rows from the SQLAlchemy extractor until exhausted."""
        while True:
            record = self._alchemy_extractor.extract()
            if not record:
                return
            yield record

    def _get_table_key(self, row):
        # type: (Dict[str, Any]) -> Union[TableKey, None]
        """Grouping key: current schema plus table name; None for a falsy row."""
        if not row:
            return None
        return TableKey(schema_name=row['schema_name'],
                        table_name=row['name'])
class PrestoViewMetadataExtractor(Extractor):
    """
    Extracts Presto View and column metadata from the underlying meta store
    database using SQLAlchemyExtractor. No dedicated view model is needed;
    the existing TableMetadata is reused.
    """
    # SQL statement to extract View metadata
    # {where_clause_suffix} could be used to filter schemas
    SQL_STATEMENT = """
    SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, t.VIEW_ORIGINAL_TEXT as view_original_text
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    WHERE t.VIEW_EXPANDED_TEXT = '/* Presto View */'
    {where_clause_suffix}
    ORDER BY t.TBL_ID desc;
    """

    # Presto View data prefix and suffix definition:
    # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L153-L154
    PRESTO_VIEW_PREFIX = '/* Presto View: '
    PRESTO_VIEW_SUFFIX = ' */'

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: 'gold'
    })

    def init(self, conf):
        # type: (ConfigTree) -> None
        """Render the metastore SQL and set up the SQLAlchemy extractor."""
        conf = conf.with_fallback(PrestoViewMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(PrestoViewMetadataExtractor.CLUSTER_KEY))

        self.sql_stmt = PrestoViewMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                PrestoViewMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

        LOGGER.info('SQL for hive metastore: {}'.format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over view metadata.
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        """Return the next TableMetadata record, or None once exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def get_scope(self):
        # type: () -> str
        """Config scope under which this extractor's settings are nested."""
        return 'extractor.presto_view_metadata'

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """Yield one TableMetadata per view row from the SQLAlchemy extractor."""
        while True:
            view_row = self._alchemy_extractor.extract()
            if not view_row:
                return
            yield TableMetadata(database='presto',
                                cluster=self._cluster,
                                schema=view_row['schema'],
                                name=view_row['name'],
                                description=None,
                                columns=self._get_column_metadata(
                                    view_row['view_original_text']),
                                is_view=True)

    def _get_column_metadata(self, view_original_text):
        # type: (str) -> List[ColumnMetadata]
        """
        Decode VIEW_ORIGINAL_TEXT (base64 JSON wrapped in Presto view markers)
        and build ColumnMetadata in the order the columns appear in the
        Presto CREATE VIEW SQL.
        :param view_original_text:
        :return:
        """
        # Strip the '/* Presto View: ' ... ' */' wrapper around the payload.
        marker_prefix = PrestoViewMetadataExtractor.PRESTO_VIEW_PREFIX
        marker_suffix = PrestoViewMetadataExtractor.PRESTO_VIEW_SUFFIX
        encoded_view_info = view_original_text.split(marker_prefix, 1)[-1]
        encoded_view_info = encoded_view_info.rsplit(marker_suffix, 1)[0]

        # The payload is base64-encoded JSON:
        # https://github.com/prestodb/presto/blob/43bd519052ba4c56ff1f4fc807075637ab5f4f10/presto-hive/src/main/java/com/facebook/presto/hive/HiveUtil.java#L602-L605
        view_columns = json.loads(base64.b64decode(encoded_view_info)).get('columns')

        result = []
        for position, col in enumerate(view_columns):
            result.append(ColumnMetadata(name=col['name'],
                                         description=None,
                                         col_type=col['type'],
                                         sort_order=position))
        return result
# Пример (Example) #17
class PrestoTableMetadataExtractor(Extractor):
    """
    Extracts Hive table and column metadata from the underlying meta store
    database using SQLAlchemyExtractor.
    """

    WHERE_CLAUSE_SUFFIX_KEY = "where_clause_suffix"
    CLUSTER_KEY = "cluster"
    DATABASE_KEY = "database"

    # Table/column metadata query over information_schema; views are joined
    # in so each row can be flagged with is_view.
    SQL_STATEMENT = """
    SELECT
      a.table_catalog AS catalog
      , a.table_schema AS schema
      , a.table_name AS name
      , NULL AS description
      , a.column_name AS col_name
      , a.ordinal_position as col_sort_order
      , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
      , a.comment AS col_description
      , a.data_type AS col_type
      , IF(b.table_name is not null, 1, 0) AS is_view
    FROM {cluster_prefix}information_schema.columns a
    LEFT JOIN {cluster_prefix}information_schema.views b ON a.table_catalog = b.table_catalog
        and a.table_schema = b.table_schema
        and a.table_name = b.table_name
    {where_clause_suffix}
    """

    # Config keys.
    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {
            WHERE_CLAUSE_SUFFIX_KEY: "WHERE a.table_schema NOT IN ('pg_catalog', 'information_schema')",
            DATABASE_KEY: "presto",
        }
    )

    def init(self, conf):
        # type: (ConfigTree) -> None
        """Resolve config, render the SQL and initialize the SQLAlchemy extractor."""
        self.conf = conf.with_fallback(PrestoTableMetadataExtractor.DEFAULT_CONFIG)
        self._database = "{}".format(
            self.conf.get_string(PrestoTableMetadataExtractor.DATABASE_KEY)
        )
        # Cluster is optional; when present it prefixes information_schema.
        self._cluster = self.conf.get(PrestoTableMetadataExtractor.CLUSTER_KEY, None)
        LOGGER.info("Cluster name: {}".format(self._cluster))

        prefix = "" if self._cluster is None else self._cluster + "."

        self.sql_stmt = PrestoTableMetadataExtractor.SQL_STATEMENT.format(
            cluster_prefix=prefix,
            where_clause_suffix=self.conf.get_string(
                PrestoTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY
            )
            or "",
        )

        LOGGER.info("SQL for presto: {}".format(self.sql_stmt))

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(
            self.conf, self._alchemy_extractor.get_scope()
        ).with_fallback(
            ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt})
        )

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over grouped table metadata.
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        """Return the next TableMetadata record, or None once exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def get_scope(self):
        # type: () -> str
        """Config scope under which this extractor's settings are nested."""
        return "extractor.presto_table_metadata"

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """Group raw rows by table key and emit one TableMetadata per table."""
        for _, table_rows in groupby(self._get_raw_extract_iter(), self._get_table_key):
            col_list = []
            table_row = None
            for table_row in table_rows:
                col_list.append(
                    ColumnMetadata(
                        table_row["col_name"],
                        table_row["col_description"],
                        table_row["col_type"],
                        table_row["col_sort_order"],
                    )
                )

            # groupby never yields an empty group, so table_row is the last row.
            yield TableMetadata(
                self._database,
                self._cluster,
                table_row["schema"],
                table_row["name"],
                table_row["description"],
                col_list,
                is_view=bool(table_row["is_view"]),
            )

    def _get_raw_extract_iter(self):
        # type: () -> Iterator[Dict[str, Any]]
        """Yield result rows from the SQLAlchemy extractor until exhausted."""
        while True:
            record = self._alchemy_extractor.extract()
            if not record:
                return
            yield record

    def _get_table_key(self, row):
        # type: (Dict[str, Any]) -> Union[TableKey, None]
        """Grouping key: schema plus table name; None for a falsy row."""
        if not row:
            return None
        return TableKey(schema=row["schema"], table_name=row["name"])
# Пример (Example) #18
class OracleMetadataExtractor(Extractor):
    """
    Extracts Oracle table and column metadata from the underlying meta store
    database using SQLAlchemyExtractor.
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME
    })

    def init(self, conf: ConfigTree) -> None:
        """Resolve config, build the SQL and initialize the SQLAlchemy extractor."""
        conf = conf.with_fallback(OracleMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(OracleMetadataExtractor.CLUSTER_KEY,
                                        default='oracle')

        self._database = conf.get_string(OracleMetadataExtractor.DATABASE_KEY,
                                         default='oracle')

        self.sql_stmt = self.get_sql_statement(
            where_clause_suffix=conf.get_string(
                OracleMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY), )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope()) \
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        # A scoped EXTRACT_SQL override, if present, takes precedence over the
        # statement built above.
        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info('SQL for oracle metadata: %s', self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over grouped table metadata.
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata record, or None once exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """Group raw rows by table key and emit one TableMetadata per table."""
        for _, table_rows in groupby(self._get_raw_extract_iter(),
                                     self._get_table_key):
            col_list = []
            table_row = None
            for table_row in table_rows:
                col_list.append(
                    ColumnMetadata(table_row['col_name'],
                                   table_row['col_description'],
                                   table_row['col_type'],
                                   table_row['col_sort_order']))

            # groupby never yields an empty group, so table_row is the last row.
            yield TableMetadata(self._database, table_row['cluster'],
                                table_row['schema'], table_row['name'],
                                table_row['description'], col_list)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """Yield result rows from the SQLAlchemy extractor until exhausted."""
        while True:
            record = self._alchemy_extractor.extract()
            if not record:
                return
            yield record

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """Grouping key: schema plus table name; None for a falsy row."""
        if not row:
            return None
        return TableKey(schema=row['schema'], table_name=row['name'])

    def get_sql_statement(self, where_clause_suffix: str) -> str:
        """Render the metadata SQL with the cluster literal and filter applied."""
        cluster_literal = "'{}'".format(self._cluster)

        return """
        SELECT
            {cluster_source} as "cluster",
            lower(c.owner) as "schema",
            lower(c.table_name) as "name",
            tc.comments as "description",
            lower(c.column_name) as "col_name",
            lower(c.data_type) as "col_type",
            cc.comments as "col_description",
            lower(c.column_id) as "col_sort_order"
        FROM
            all_tab_columns c
        LEFT JOIN
            all_tab_comments tc ON c.owner=tc.owner AND c.table_name=tc.table_name
        LEFT JOIN
            all_col_comments cc ON c.owner=cc.owner AND c.table_name=cc.table_name AND c.column_name=cc.column_name
        {where_clause_suffix}
        ORDER BY "cluster", "schema", "name", "col_sort_order"
        """.format(
            cluster_source=cluster_literal,
            where_clause_suffix=where_clause_suffix,
        )

    def get_scope(self) -> str:
        """Config scope under which this extractor's settings are nested."""
        return 'extractor.oracle_metadata'
# Пример (Example) #19
class IndexExtractor(Extractor):
    """Base Index Extractor
    SQL language-specific classes should inherit from this class, but
    this class should not be called by itself.

    Subclasses are expected to provide ``_get_sql_statement`` and
    ``_get_extract_iter``.
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = "where_clause_suffix"
    CLUSTER_KEY = "cluster_key"
    USE_CATALOG_AS_CLUSTER_NAME = "use_catalog_as_cluster_name"
    DATABASE_KEY = "database_key"
    # NOTE(review): CONN_STRING_KEY is unused below; DEFAULT_CONFIG relies on
    # the module-level SQLALCHEMY_CONN_STRING_KEY instead — confirm intended.
    CONN_STRING_KEY = "conn_string"

    # Default values
    DEFAULT_CLUSTER_NAME = "master"

    # NOTE(review): DATABASE_KEY has no default here, so init() will fail
    # unless the caller supplies it — presumably deliberate for a base class.
    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: "",
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True,
        SQLALCHEMY_CONN_STRING_KEY: None,
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, build the subclass-provided SQL statement and
        initialize the wrapped SQLAlchemyExtractor.
        """
        conf = conf.with_fallback(self.DEFAULT_CONFIG)

        self._cluster = "{}".format(conf.get_string(self.CLUSTER_KEY))

        self._database = conf.get_string(self.DATABASE_KEY)

        # Subclasses supply _get_sql_statement for their SQL dialect.
        self.sql_stmt = self._get_sql_statement(
            use_catalog_as_cluster_name=conf.get_bool(
                self.USE_CATALOG_AS_CLUSTER_NAME),
            where_clause_suffix=conf.get_string(self.WHERE_CLAUSE_SUFFIX_KEY),
        )

        self._alchemy_extractor = SQLAlchemyExtractor()

        sql_alch_conf = Scoped.get_scoped_conf(
            conf, SQLALCHEMY_ENGINE_SCOPE).with_fallback(
                ConfigFactory.from_dict(
                    {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        # A scoped EXTRACT_SQL override, if present, wins over the built SQL.
        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        # NOTE(review): message says "postgres" in a generic base class —
        # likely a leftover from the class this was derived from.
        LOGGER.info("SQL for postgres metadata: %s", self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        # BUG FIX: the annotation previously referenced the undefined name
        # `iterator` (lowercase). Attribute annotations are evaluated at
        # runtime (PEP 526), so that raised NameError; use typing.Iterator.
        self._extract_iter: Union[None, Iterator] = None

    def extract(self):
        """Return the next extracted record, or None once exhausted."""
        if not self._extract_iter:
            # _get_extract_iter is provided by the concrete subclass.
            self._extract_iter = self._get_extract_iter()
        try:
            extraction = next(self._extract_iter)
            return extraction
        except StopIteration:
            return None

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def get_scope(self) -> str:
        """Config scope under which this extractor's settings are nested."""
        return "extractor.markdown_index"
class Db2MetadataExtractor(Extractor):
    """
    Extracts Db2 table and column metadata from the underlying meta store
    database using SQLAlchemyExtractor.
    """
    # SELECT statement from Db2 SYSIBM to extract table and column metadata
    SQL_STATEMENT = """
    SELECT
      {cluster_source} as cluster, c.TABSCHEMA as schema, c.TABNAME as name, t.REMARKS as description,
      c.COLNAME as col_name,
      CASE WHEN c.TYPENAME='VARCHAR' OR c.TYPENAME='CHARACTER' THEN
      TRIM (TRAILING FROM c.TYPENAME) concat '(' concat c.LENGTH concat ')'
      WHEN c.TYPENAME='DECIMAL' THEN
      TRIM (TRAILING FROM c.TYPENAME) concat '(' concat c.LENGTH concat ',' concat c.SCALE concat ')'
      ELSE TRIM (TRAILING FROM c.TYPENAME) END as col_type,
      c.REMARKS as col_description, c.COLNO as col_sort_order
    FROM SYSCAT.COLUMNS c
    INNER JOIN
      SYSCAT.TABLES as t on c.TABSCHEMA=t.TABSCHEMA and c.TABNAME=t.TABNAME
    {where_clause_suffix}
    ORDER by cluster, schema, name, col_sort_order ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME
    })

    def init(self, conf):
        # type: (ConfigTree) -> None
        """Resolve config, render the SQL and initialize the SQLAlchemy extractor."""
        conf = conf.with_fallback(Db2MetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(Db2MetadataExtractor.CLUSTER_KEY))

        cluster_literal = "'{}'".format(self._cluster)

        database = conf.get_string(Db2MetadataExtractor.DATABASE_KEY,
                                   default='db2')
        # Python 2 compatibility: coerce unicode database names to bytes.
        if six.PY2 and isinstance(database, six.text_type):
            database = database.encode('utf-8', 'ignore')

        self._database = database

        self.sql_stmt = Db2MetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                Db2MetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_literal)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        # A scoped EXTRACT_SQL override, if present, wins over the built SQL.
        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info('SQL for Db2 metadata: {}'.format(self.sql_stmt))

        self._alchemy_extractor.init(sql_alch_conf)
        # Lazily-created iterator over grouped table metadata.
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        """Return the next TableMetadata record, or None once exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def get_scope(self):
        # type: () -> str
        """Config scope under which this extractor's settings are nested."""
        return 'extractor.db2_metadata'

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """Group raw rows by table key and emit one TableMetadata per table."""
        for _, table_rows in groupby(self._get_raw_extract_iter(),
                                     self._get_table_key):
            col_list = []
            table_row = None
            for table_row in table_rows:
                col_list.append(
                    ColumnMetadata(table_row['col_name'],
                                   table_row['col_description'],
                                   table_row['col_type'],
                                   table_row['col_sort_order']))

            # groupby never yields an empty group, so table_row is the last row.
            yield TableMetadata(self._database, table_row['cluster'],
                                table_row['schema'], table_row['name'],
                                table_row['description'], col_list)

    def _get_raw_extract_iter(self):
        # type: () -> Iterator[Dict[str, Any]]
        """Yield result rows from the SQLAlchemy extractor until exhausted."""
        while True:
            record = self._alchemy_extractor.extract()
            if not record:
                return
            yield record

    def _get_table_key(self, row):
        # type: (Dict[str, Any]) -> Union[TableKey, None]
        """Grouping key: schema plus table name; None for a falsy row."""
        if not row:
            return None
        return TableKey(schema=row['schema'], table_name=row['name'])
class AthenaMetadataExtractor(Extractor):
    """
    Extracts Athena table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """

    SQL_STATEMENT = """
    SELECT
        {catalog_source} as cluster, table_schema as schema, table_name as name, column_name as col_name,
        data_type as col_type,ordinal_position as col_sort_order,
        comment as col_description, extra_info as extras from information_schema.columns
        {where_clause_suffix}
        ORDER by cluster, schema, name, col_sort_order ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CATALOG_KEY = 'catalog_source'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CATALOG_KEY: DEFAULT_CLUSTER_NAME
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL, and prime the
        wrapped SQLAlchemy extractor with it.
        """
        conf = conf.with_fallback(AthenaMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(AthenaMetadataExtractor.CATALOG_KEY)

        self.sql_stmt = AthenaMetadataExtractor.SQL_STATEMENT.format(
            catalog_source=self._cluster,
            where_clause_suffix=conf.get_string(
                AthenaMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

        LOGGER.info('SQL for Athena metadata: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        inner_conf = Scoped.get_scoped_conf(
            conf, self._alchemy_extractor.get_scope()).with_fallback(
                ConfigFactory.from_dict(
                    {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
        self._alchemy_extractor.init(inner_conf)

        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if self._extract_iter is None:
            self._extract_iter = self._get_extract_iter()
        return next(self._extract_iter, None)

    def get_scope(self) -> str:
        """Configuration scope under which this extractor's settings live."""
        return 'extractor.athena_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Group raw rows by table key and yield one TableMetadata per table.

        A column's description prefers the Athena 'extras' field, falling
        back to 'col_description' when extras is NULL.
        """
        for _, rows in groupby(self._get_raw_extract_iter(),
                               self._get_table_key):
            columns = []

            for last_row in rows:
                extras = last_row['extras']
                col_desc = (last_row['col_description']
                            if extras is None else extras)
                columns.append(
                    ColumnMetadata(last_row['col_name'], col_desc,
                                   last_row['col_type'],
                                   last_row['col_sort_order']))

            yield TableMetadata('athena', last_row['cluster'],
                                last_row['schema'], last_row['name'], '',
                                columns)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """Yield rows from the SQLAlchemy extractor until it is exhausted."""
        while True:
            current = self._alchemy_extractor.extract()
            if not current:
                return
            yield current

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """Grouping key (schema + table name) for a row; None for falsy rows."""
        if not row:
            return None
        return TableKey(schema=row['schema'], table_name=row['name'])
# Example #22
class DruidMetadataExtractor(Extractor):
    """
    Extracts Druid table and column metadata from Druid's INFORMATION_SCHEMA
    using SQLAlchemyExtractor.

    (Earlier docstrings here said "dbapi extractor"; the code below visibly
    wraps SQLAlchemyExtractor, so the docs were corrected.)
    """
    SQL_STATEMENT = textwrap.dedent("""
        SELECT
        TABLE_SCHEMA as schema,
        TABLE_NAME as name,
        COLUMN_NAME as col_name,
        DATA_TYPE as col_type,
        ORDINAL_POSITION as col_sort_order
        FROM INFORMATION_SCHEMA.COLUMNS
        {where_clause_suffix}
        order by TABLE_SCHEMA, TABLE_NAME, CAST(ORDINAL_POSITION AS int)
    """)

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: 'gold'
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL and initialize the
        wrapped SQLAlchemy extractor.
        """
        conf = conf.with_fallback(DruidMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(DruidMetadataExtractor.CLUSTER_KEY)

        self.sql_stmt = DruidMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                DruidMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY, default=''))

        # Added for consistency: every sibling extractor in this file logs
        # its rendered SQL. Lazy %-args avoid formatting when INFO is off.
        LOGGER.info('SQL for Druid metadata: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        """Configuration scope under which this extractor's settings live."""
        return 'extractor.druid_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and the raw row iterator, group rows into
        tables and yield one TableMetadata per table.

        Druid's INFORMATION_SCHEMA carries no table or column descriptions,
        so those fields are emitted empty.
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []
            # no table description and column description
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(name=row['col_name'],
                                   description='',
                                   col_type=row['col_type'],
                                   sort_order=row['col_sort_order']))
            yield TableMetadata(database='druid',
                                cluster=self._cluster,
                                schema=last_row['schema'],
                                name=last_row['name'],
                                description='',
                                columns=columns)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result rows from the SQLAlchemy extractor.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name.
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
# Example #23
class SqliteMetadataExtractor(Extractor):
    """
    Extracts Sqlite table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = "where_clause_suffix"
    CLUSTER_KEY = "cluster_key"
    USE_CATALOG_AS_CLUSTER_NAME = "use_catalog_as_cluster_name"
    DATABASE_KEY = "database_key"

    # Default values
    DEFAULT_CLUSTER_NAME = "master"

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: "1=1",
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        # NOTE(review): declared here but never read by init() below --
        # likely copied from the catalog-aware extractors; confirm intent.
        USE_CATALOG_AS_CLUSTER_NAME: True,
    })

    SQL_STATEMENT = """
            SELECT
                '{cluster_source}' as cluster,
                '' as schema,
                m.name as name,
                '' as description,
                p.name as col_name,
                p.type as col_type,
                '' as col_description,
                row_number() over win as col_sort_order
            FROM
                sqlite_master AS m
            JOIN
                pragma_table_info(m.name) AS p
            WHERE
                m.type = 'table' AND {where_clause_suffix}
            WINDOW WIN as (ORDER BY m.name, p.name)
    """
    # NOTE(review): 'where' is not referenced anywhere in this class body;
    # presumably an example value for WHERE_CLAUSE_SUFFIX_KEY.
    where = "p.type like 'text' or p.type like 'varchar%' or p.type like 'char%'"

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL and initialize the
        wrapped SQLAlchemy extractor.
        """
        conf = conf.with_fallback(SqliteMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = conf.get_string(SqliteMetadataExtractor.CLUSTER_KEY)

        self._database = conf.get_string(SqliteMetadataExtractor.DATABASE_KEY,
                                         default="sqlite")

        self.sql_stmt = SqliteMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SqliteMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=self._cluster,
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(
            conf, self._alchemy_extractor.get_scope()).with_fallback(
                ConfigFactory.from_dict(
                    {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(
            SQLAlchemyExtractor.EXTRACT_SQL)

        LOGGER.info("SQL for sqlite metadata: %s", self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        # BUG FIX: this method was missing. Every sibling extractor defines
        # get_scope(), and without it scoped configuration for this
        # extractor cannot be resolved by the task framework.
        return "extractor.sqlite_metadata"

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and the raw row iterator, group rows into
        tables and yield one TableMetadata per table.
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["col_type"],
                        row["col_sort_order"],
                    ))

            yield TableMetadata(
                self._database,
                last_row["cluster"],
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
            )

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result rows from the SQLAlchemy extractor.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name.
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row["schema"], table_name=row["name"])

        return None
# Example #24
class MysqlMetadataExtractor(Extractor):
    """
    Extracts mysql table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """
    # SELECT statement from mysql information_schema to extract table and column metadata
    SQL_STATEMENT = """
        SELECT
        lower(c.column_name) AS col_name,
        c.column_comment AS col_description,
        lower(c.data_type) AS col_type,
        lower(c.ordinal_position) AS col_sort_order,
        {cluster_source} AS cluster,
        lower(c.table_schema) AS "schema",
        lower(c.table_name) AS name,
        t.table_comment AS description,
        case when lower(t.table_type) = "view" then "true" else "false" end AS is_view
        FROM
        INFORMATION_SCHEMA.COLUMNS AS c
        LEFT JOIN
        INFORMATION_SCHEMA.TABLES t
            ON c.TABLE_NAME = t.TABLE_NAME
            AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
        {where_clause_suffix}
        ORDER by cluster, "schema", name, col_sort_order ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict(
        {WHERE_CLAUSE_SUFFIX_KEY: ' ', CLUSTER_KEY: DEFAULT_CLUSTER_NAME, USE_CATALOG_AS_CLUSTER_NAME: True}
    )

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL and initialize the
        wrapped SQLAlchemy extractor.
        """
        conf = conf.with_fallback(MysqlMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(conf.get_string(MysqlMetadataExtractor.CLUSTER_KEY))

        # Cluster column is either the live table_catalog or the configured literal.
        if conf.get_bool(MysqlMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        # The six.PY2 re-encoding branch was removed: this module already
        # uses Python-3-only syntax (variable annotations), so that branch
        # could never execute.
        self._database = conf.get_string(MysqlMetadataExtractor.DATABASE_KEY, default='mysql')

        self.sql_stmt = MysqlMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(MysqlMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source
        )

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self.sql_stmt = sql_alch_conf.get_string(SQLAlchemyExtractor.EXTRACT_SQL)

        # Lazy %-formatting, consistent with the other extractors in this file.
        LOGGER.info('SQL for mysql metadata: %s', self.sql_stmt)

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        """Configuration scope under which this extractor's settings live."""
        return 'extractor.mysql_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and the raw row iterator, group rows into
        tables and yield one TableMetadata per table.
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(ColumnMetadata(row['col_name'], row['col_description'],
                                              row['col_type'], row['col_sort_order']))

            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                is_view=last_row['is_view'])

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result rows from the SQLAlchemy extractor.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name.
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
class HiveTableMetadataExtractor(Extractor):
    """
    Extracts Hive table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """
    EXTRACT_SQL = 'extract_sql'
    # SELECT statement from hive metastore database to extract table and column metadata
    # Below SELECT statement uses UNION to combining two queries together.
    # 1st query is retrieving partition columns
    # 2nd query is retrieving columns
    # Using UNION to combine above two statements and order by table & partition identifier.
    DEFAULT_SQL_STATEMENT = """
    SELECT source.* FROM
    (SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
           p.PKEY_NAME as col_name, p.INTEGER_IDX as col_sort_order,
           p.PKEY_TYPE as col_type, p.PKEY_COMMENT as col_description, 1 as "is_partition_col",
           IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    JOIN PARTITION_KEYS p ON t.TBL_ID = p.TBL_ID
    LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
    {where_clause_suffix}
    UNION
    SELECT t.TBL_ID, d.NAME as `schema`, t.TBL_NAME name, t.TBL_TYPE, tp.PARAM_VALUE as description,
           c.COLUMN_NAME as col_name, c.INTEGER_IDX as col_sort_order,
           c.TYPE_NAME as col_type, c.COMMENT as col_description, 0 as "is_partition_col",
           IF(t.TBL_TYPE = 'VIRTUAL_VIEW', 1, 0) "is_view"
    FROM TBLS t
    JOIN DBS d ON t.DB_ID = d.DB_ID
    JOIN SDS s ON t.SD_ID = s.SD_ID
    JOIN COLUMNS_V2 c ON s.CD_ID = c.CD_ID
    LEFT JOIN TABLE_PARAMS tp ON (t.TBL_ID = tp.TBL_ID AND tp.PARAM_KEY='comment')
    {where_clause_suffix}
    ) source
    ORDER by tbl_id, is_partition_col desc;
    """

    DEFAULT_POSTGRES_SQL_STATEMENT = """
    SELECT source.* FROM
    (SELECT t."TBL_ID" as tbl_id, d."NAME" as "schema", t."TBL_NAME" as name, t."TBL_TYPE",
           tp."PARAM_VALUE" as description, p."PKEY_NAME" as col_name, p."INTEGER_IDX" as col_sort_order,
           p."PKEY_TYPE" as col_type, p."PKEY_COMMENT" as col_description, 1 as "is_partition_col",
           CASE WHEN t."TBL_TYPE" = 'VIRTUAL_VIEW' THEN 1
                ELSE 0 END as "is_view"
    FROM "TBLS" t
    JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
    JOIN "PARTITION_KEYS" p ON t."TBL_ID" = p."TBL_ID"
    LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
    {where_clause_suffix}
    UNION
    SELECT t."TBL_ID" as tbl_id, d."NAME" as "schema", t."TBL_NAME" as name, t."TBL_TYPE",
           tp."PARAM_VALUE" as description, c."COLUMN_NAME" as col_name, c."INTEGER_IDX" as col_sort_order,
           c."TYPE_NAME" as col_type, c."COMMENT" as col_description, 0 as "is_partition_col",
           CASE WHEN t."TBL_TYPE" = 'VIRTUAL_VIEW' THEN 1
                ELSE 0 END as "is_view"
    FROM "TBLS" t
    JOIN "DBS" d ON t."DB_ID" = d."DB_ID"
    JOIN "SDS" s ON t."SD_ID" = s."SD_ID"
    JOIN "COLUMNS_V2" c ON s."CD_ID" = c."CD_ID"
    LEFT JOIN "TABLE_PARAMS" tp ON (t."TBL_ID" = tp."TBL_ID" AND tp."PARAM_KEY"='comment')
    {where_clause_suffix}
    ) source
    ORDER by tbl_id, is_partition_col desc;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster'

    DEFAULT_CONFIG = ConfigFactory.from_dict({WHERE_CLAUSE_SUFFIX_KEY: ' ',
                                              CLUSTER_KEY: 'gold'})

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, pick a metastore-dialect-appropriate default
        SQL (overridable via EXTRACT_SQL), and initialize the wrapped
        SQLAlchemy extractor.
        """
        conf = conf.with_fallback(HiveTableMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(conf.get_string(HiveTableMetadataExtractor.CLUSTER_KEY))

        self._alchemy_extractor = SQLAlchemyExtractor()

        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())
        default_sql = self._choose_default_sql_stm(sql_alch_conf).format(
            where_clause_suffix=conf.get_string(HiveTableMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY))

        self.sql_stmt = conf.get_string(HiveTableMetadataExtractor.EXTRACT_SQL, default=default_sql)

        # Lazy %-formatting, consistent with the other extractors in this file.
        LOGGER.info('SQL for hive metastore: %s', self.sql_stmt)

        sql_alch_conf = sql_alch_conf.with_fallback(ConfigFactory.from_dict(
            {SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))
        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def _choose_default_sql_stm(self, conf: ConfigTree) -> str:
        """Pick the Postgres-flavored SQL when the metastore URL is Postgres."""
        url = make_url(conf.get_string(SQLAlchemyExtractor.CONN_STRING))
        if url.drivername.lower() in ['postgresql', 'postgres']:
            return HiveTableMetadataExtractor.DEFAULT_POSTGRES_SQL_STATEMENT
        else:
            return HiveTableMetadataExtractor.DEFAULT_SQL_STATEMENT

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        """Configuration scope under which this extractor's settings live."""
        return 'extractor.hive_table_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and the raw row iterator, group rows into
        tables and yield one TableMetadata per table. Partition columns get
        the PARTITION_BADGE.
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                column = None
                if row['is_partition_col'] == 1:
                    # create add a badge to indicate partition column
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'], [PARTITION_BADGE])
                else:
                    column = ColumnMetadata(row['col_name'], row['col_description'],
                                            row['col_type'], row['col_sort_order'])
                columns.append(column)
            is_view = last_row['is_view'] == 1
            yield TableMetadata('hive', self._cluster,
                                last_row['schema'],
                                last_row['name'],
                                last_row['description'],
                                columns,
                                is_view=is_view)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result rows from the SQLAlchemy extractor.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name.
        :param row:
        :return:
        """
        if row:
            return TableKey(schema=row['schema'], table_name=row['name'])

        return None
# Example #26
class PostgresMetadataExtractor(Extractor):
    """
    Extracts Postgres table and column metadata from underlying meta store database using SQLAlchemyExtractor
    """
    # SELECT statement from postgres information_schema to extract table and column metadata
    SQL_STATEMENT = """
    SELECT
      {cluster_source} as cluster, c.table_schema as schema_name, c.table_name as name, pgtd.description as description
      ,c.column_name as col_name, c.data_type as col_type
      , pgcd.description as col_description, ordinal_position as col_sort_order
    FROM INFORMATION_SCHEMA.COLUMNS c
    INNER JOIN
      pg_catalog.pg_statio_all_tables as st on c.table_schema=st.schemaname and c.table_name=st.relname
    LEFT JOIN
      pg_catalog.pg_description pgcd on pgcd.objoid=st.relid and pgcd.objsubid=c.ordinal_position
    LEFT JOIN
      pg_catalog.pg_description pgtd on pgtd.objoid=st.relid and pgtd.objsubid=0
    {where_clause_suffix}
    ORDER by cluster, schema_name, name, col_sort_order ;
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True
    })

    def init(self, conf: ConfigTree) -> None:
        """
        Resolve configuration, render the metadata SQL and initialize the
        wrapped SQLAlchemy extractor.
        """
        conf = conf.with_fallback(PostgresMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(PostgresMetadataExtractor.CLUSTER_KEY))

        # Cluster column is either the live table_catalog or the configured literal.
        if conf.get_bool(
                PostgresMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        # BUG FIX: the original called .encode('utf-8', 'ignore') on the
        # database name, which on Python 3 yields bytes (b'postgres') and
        # leaked into every TableMetadata record. Keep it as str.
        self._database = conf.get_string(
            PostgresMetadataExtractor.DATABASE_KEY, default='postgres')

        self.sql_stmt = PostgresMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                PostgresMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source)

        # Lazy %-formatting, consistent with the other extractors in this file.
        LOGGER.info('SQL for postgres metadata: %s', self.sql_stmt)

        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter: Union[None, Iterator] = None

    def extract(self) -> Union[TableMetadata, None]:
        """Return the next TableMetadata, or None when exhausted."""
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self) -> str:
        """Configuration scope under which this extractor's settings live."""
        return 'extractor.postgres_metadata'

    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and the raw row iterator, group rows into
        tables and yield one TableMetadata per table.
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(row['col_name'], row['col_description'],
                                   row['col_type'], row['col_sort_order']))

            yield TableMetadata(self._database, last_row['cluster'],
                                last_row['schema_name'], last_row['name'],
                                last_row['description'], columns)

    def _get_raw_extract_iter(self) -> Iterator[Dict[str, Any]]:
        """
        Provides iterator of result rows from the SQLAlchemy extractor.
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row: Dict[str, Any]) -> Union[TableKey, None]:
        """
        Table key consists of schema and table name.

        NOTE(review): this class passes schema_name= while the sibling
        extractors pass schema= to TableKey -- confirm which field name the
        shared TableKey type actually declares.
        :param row:
        :return:
        """
        if row:
            return TableKey(schema_name=row['schema_name'],
                            table_name=row['name'])

        return None
# Example #27
class SnowflakeMetadataExtractor(Extractor):
    """
    Extracts Snowflake table and column metadata from underlying meta store database using SQLAlchemyExtractor.
    Requirements:
        snowflake-connector-python
        snowflake-sqlalchemy
    """
    # SELECT statement from snowflake information_schema to extract table and column metadata
    SQL_STATEMENT = """
    SELECT
        lower(c.column_name) AS col_name,
        c.comment AS col_description,
        lower(c.data_type) AS col_type,
        lower(c.ordinal_position) AS col_sort_order,
        lower(c.table_catalog) AS database,
        lower({cluster_source}) AS cluster,
        lower(c.table_schema) AS schema_name,
        lower(c.table_name) AS name,
        t.comment AS description,
        decode(lower(t.table_type), 'view', 'true', 'false') AS is_view
    FROM
        {database}.INFORMATION_SCHEMA.COLUMNS AS c
    LEFT JOIN
        {database}.INFORMATION_SCHEMA.TABLES t
            ON c.TABLE_NAME = t.TABLE_NAME
            AND c.TABLE_SCHEMA = t.TABLE_SCHEMA
    {where_clause_suffix};
    """

    # CONFIG KEYS
    WHERE_CLAUSE_SUFFIX_KEY = 'where_clause_suffix'
    CLUSTER_KEY = 'cluster_key'
    USE_CATALOG_AS_CLUSTER_NAME = 'use_catalog_as_cluster_name'
    DATABASE_KEY = 'database_key'

    # Default values
    DEFAULT_CLUSTER_NAME = 'master'

    DEFAULT_CONFIG = ConfigFactory.from_dict({
        WHERE_CLAUSE_SUFFIX_KEY: ' ',
        CLUSTER_KEY: DEFAULT_CLUSTER_NAME,
        USE_CATALOG_AS_CLUSTER_NAME: True,
        DATABASE_KEY: 'prod'
    })

    def init(self, conf):
        # type: (ConfigTree) -> None
        """
        Build the metadata SQL from config and initialize the wrapped
        SQLAlchemyExtractor with it.

        :param conf: scoped config; see the CONFIG KEYS class attributes
            for the recognized options.
        """
        conf = conf.with_fallback(SnowflakeMetadataExtractor.DEFAULT_CONFIG)
        self._cluster = '{}'.format(
            conf.get_string(SnowflakeMetadataExtractor.CLUSTER_KEY))

        # Either take the cluster from each row's table_catalog column, or
        # embed the configured cluster name as a SQL string literal.
        if conf.get_bool(
                SnowflakeMetadataExtractor.USE_CATALOG_AS_CLUSTER_NAME):
            cluster_source = "c.table_catalog"
        else:
            cluster_source = "'{}'".format(self._cluster)

        # BUG FIX: previously this value was run through
        # .encode('utf-8', 'ignore'), which on Python 3 yields bytes, so the
        # SQL below rendered as ``b'prod'.INFORMATION_SCHEMA...`` and
        # TableMetadata received bytes for its database field. Keep it a str.
        self._database = conf.get_string(
            SnowflakeMetadataExtractor.DATABASE_KEY)

        self.sql_stmt = SnowflakeMetadataExtractor.SQL_STATEMENT.format(
            where_clause_suffix=conf.get_string(
                SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY),
            cluster_source=cluster_source,
            database=self._database)

        LOGGER.info('SQL for snowflake metadata: {}'.format(self.sql_stmt))

        # Delegate actual extraction to SQLAlchemyExtractor, seeding its
        # scoped config with the statement built above.
        self._alchemy_extractor = SQLAlchemyExtractor()
        sql_alch_conf = Scoped.get_scoped_conf(conf, self._alchemy_extractor.get_scope())\
            .with_fallback(ConfigFactory.from_dict({SQLAlchemyExtractor.EXTRACT_SQL: self.sql_stmt}))

        self._alchemy_extractor.init(sql_alch_conf)
        self._extract_iter = None  # type: Union[None, Iterator]

    def extract(self):
        # type: () -> Union[TableMetadata, None]
        """
        Return the next TableMetadata record, or None once exhausted.
        The iterator is created lazily on first call.
        """
        if not self._extract_iter:
            self._extract_iter = self._get_extract_iter()
        try:
            return next(self._extract_iter)
        except StopIteration:
            return None

    def get_scope(self):
        # type: () -> str
        """Config scope used to namespace this extractor's settings."""
        return 'extractor.snowflake'

    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(),
                                  self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                # Descriptions pass through unidecode to strip non-ASCII
                # characters; None descriptions stay None.
                columns.append(
                    ColumnMetadata(
                        row['col_name'],
                        unidecode(row['col_description'])
                        if row['col_description'] else None, row['col_type'],
                        row['col_sort_order']))

            yield TableMetadata(
                self._database, last_row['cluster'], last_row['schema_name'],
                last_row['name'],
                unidecode(last_row['description'])
                if last_row['description'] else None, columns)

    def _get_raw_extract_iter(self):
        # type: () -> Iterator[Dict[str, Any]]
        """
        Provides iterator of result row from SQLAlchemy extractor
        :return:
        """
        row = self._alchemy_extractor.extract()
        while row:
            yield row
            row = self._alchemy_extractor.extract()

    def _get_table_key(self, row):
        # type: (Dict[str, Any]) -> Union[TableKey, None]
        """
        Table key consists of schema and table name
        :param row:
        :return:
        """
        if row:
            return TableKey(schema_name=row['schema_name'],
                            table_name=row['name'])

        return None