Example #1
    def get_inspectors(self) -> Iterable[Inspector]:
        # This method can be overridden if you want to dynamically run
        # against multiple databases.
        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)
        with engine.connect() as conn:
            self.get_catalog_metadata(conn)
            inspector = inspect(conn)
            yield inspector
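
A minimal usage sketch: each yielded SQLAlchemy Inspector can be asked for schema and table names. The `source` variable below is a hypothetical, already-configured instance of the class this method belongs to.

# Hypothetical usage; `source` is an already-configured source instance.
for inspector in source.get_inspectors():
    for schema in inspector.get_schema_names():
        # get_table_names is standard SQLAlchemy Inspector API.
        print(schema, inspector.get_table_names(schema=schema))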
Example #2
    def _get_all_tables(self) -> Set[str]:
        all_tables_query: str = textwrap.dedent("""\
        SELECT database, name AS table_name
          FROM system.tables
         WHERE name NOT LIKE '.inner%'""")

        all_tables_set = set()

        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)
        for db_row in engine.execute(text(all_tables_query)):
            all_tables_set.add(f'{db_row["database"]}.{db_row["table_name"]}')

        return all_tables_set
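
For reference, the same system.tables query can be run standalone to inspect the shape of its result. A sketch assuming the clickhouse-sqlalchemy dialect and a placeholder URL; in the source above the URL comes from config.get_sql_alchemy_url().

from sqlalchemy import create_engine, text

# Placeholder connection URL (assumes the clickhouse-sqlalchemy dialect).
engine = create_engine("clickhouse://default:@localhost:8123/default")
rows = engine.execute(text(
    "SELECT database, name AS table_name "
    "FROM system.tables WHERE name NOT LIKE '.inner%'"
))
# Same qualified-name format the method builds: "<database>.<table_name>".
all_tables = {f'{row["database"]}.{row["table_name"]}' for row in rows}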
Example #3
    def _get_all_tables(self) -> Set[str]:
        all_tables_query: str = """
        SELECT
            table_schema AS schemaname,
            table_name AS tablename
        FROM
            information_schema.tables
        WHERE
            table_type = 'BASE TABLE'
            AND table_schema NOT IN ('information_schema', 'pg_catalog', 'pg_internal')
        UNION
        SELECT DISTINCT
            schemaname,
            tablename
        FROM
            svv_external_tables
        UNION
        SELECT
            n.nspname AS schemaname,
            c.relname AS tablename
        FROM
            pg_catalog.pg_class AS c
        INNER JOIN
            pg_catalog.pg_namespace AS n
            ON c.relnamespace = n.oid
        WHERE
            c.relkind = 'v'
            AND n.nspname NOT IN ('pg_catalog', 'information_schema')
        """
        db_name = getattr(self.config, "database")
        db_alias = getattr(self.config, "database_alias")
        if db_alias:
            db_name = db_alias

        all_tables_set = set()

        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)
        for db_row in engine.execute(all_tables_query):
            all_tables_set.add(
                f'{db_name}.{db_row["schemaname"]}.{db_row["tablename"]}')

        return all_tables_set
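
The three UNION branches cover base tables from information_schema, external (Spectrum) tables from svv_external_tables, and views from pg_class (relkind = 'v'). Downstream lineage code checks candidate source paths against the returned set before recording an upstream edge; a sketch with hypothetical names:

# Hypothetical values; `source` is an already-configured source instance.
all_tables = source._get_all_tables()   # e.g. {"dev.public.orders", ...}
candidate = "dev.public.orders"
if candidate in all_tables:
    print(f"{candidate} exists; safe to record lineage")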
Example #4
    def get_catalog_metadata(self, conn: Connection) -> None:
        catalog_metadata = _get_external_db_mapping(conn)
        if catalog_metadata is None:
            return
        db_name = self.get_db_name()

        external_schema_mapping = {}
        for rel in catalog_metadata:
            if rel.eskind != 1:
                logger.debug(
                    f"Skipping {rel.schemaname} for external database mapping; "
                    f"currently only Glue is supported"
                )
                continue
            external_schema_mapping[rel.schemaname] = {
                "eskind": rel.eskind,
                "external_database": rel.databasename,
                "esoptions": rel.esoptions,
                "esoid": rel.esoid,
                "esowner": rel.esowner,
            }
        self.catalog_metadata[db_name] = external_schema_mapping
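
The resulting mapping is keyed by database name, then by external schema name; only Glue-backed schemas (eskind == 1) are kept. An illustrative shape with hypothetical values:

# Hypothetical example of the structure built above; eskind == 1 denotes
# an AWS Glue data catalog, the only kind this method maps.
catalog_metadata = {
    "dev": {
        "spectrum_schema": {
            "eskind": 1,
            "external_database": "glue_db",
            "esoptions": '{"IAM_ROLE": "..."}',
            "esoid": 12345,
            "esowner": 100,
        },
    },
}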
Example #5
    def _populate_lineage_map(self, query: str,
                              lineage_type: LineageCollectorType) -> None:
        """
        This method generate table level lineage based with the given query.
        The query should return the following columns: target_schema, target_table, source_table, source_schema

        :param query: The query to run to extract lineage.
        :type query: str
        :param lineage_type: The way the lineage should be processed
        :type lineage_type: LineageType
        return: The method does not return with anything as it directly modify the self._lineage_map property.
        :rtype: None
        """
        assert self._lineage_map is not None

        if not self._all_tables_set:
            self._all_tables_set = self._get_all_tables()

        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)

        try:
            for db_row in engine.execute(text(query)):

                if not self.config.schema_pattern.allowed(
                        db_row["target_schema"]
                ) or not self.config.table_pattern.allowed(
                        db_row["target_table"]):
                    continue

                # Target
                target_path = (
                    f'{self.config.platform_instance+"." if self.config.platform_instance else ""}'
                    f'{db_row["target_schema"]}.{db_row["target_table"]}')
                target = LineageItem(
                    dataset=LineageDataset(
                        platform=LineageDatasetPlatform.CLICKHOUSE,
                        path=target_path),
                    upstreams=set(),
                    collector_type=lineage_type,
                )

                # Source
                platform = LineageDatasetPlatform.CLICKHOUSE
                path = f'{db_row["source_schema"]}.{db_row["source_table"]}'

                sources = [LineageDataset(
                    platform=platform,
                    path=path,
                )]

                for source in sources:
                    # Filter out tables that no longer exist in ClickHouse
                    # (they may have been deleted in the meantime).
                    if (source.platform == LineageDatasetPlatform.CLICKHOUSE
                            and source.path not in self._all_tables_set):
                        logger.warning(f"{source.path} missing table")
                        continue

                    target.upstreams.add(source)

                # Merge upstreams if the dataset already exists in the lineage map.
                if target.dataset.path in self._lineage_map:

                    self._lineage_map[
                        target.dataset.path].upstreams = self._lineage_map[
                            target.dataset.path].upstreams.union(
                                target.upstreams)

                else:
                    self._lineage_map[target.dataset.path] = target

                logger.info(
                    f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}"
                )

        except Exception as e:
            logger.warning(
                f"Extracting {lineage_type.name} lineage from ClickHouse failed. "
                f"Continuing...\nError was {e}.")
Example #6
    def _populate_lineage_map(self, query: str,
                              lineage_type: LineageCollectorType) -> None:
        """
        This method generate table level lineage based with the given query.
        The query should return the following columns: target_schema, target_table, source_table, source_schema
        source_table and source_schema can be omitted if the sql_field is set because then it assumes the source_table
        and source_schema will be extracted from the sql_field by sql parsing.

        :param query: The query to run to extract lineage.
        :type query: str
        :param lineage_type: The way the lineage should be processed
        :type lineage_type: LineageType
        return: The method does not return with anything as it directly modify the self._lineage_map property.
        :rtype: None
        """
        assert self._lineage_map is not None

        if not self._all_tables_set:
            self._all_tables_set = self._get_all_tables()

        url = self.config.get_sql_alchemy_url()
        logger.debug(f"sql_alchemy_url={url}")
        engine = create_engine(url, **self.config.options)

        db_name = self._get_db_name()

        try:
            for db_row in engine.execute(query):

                if not self.config.schema_pattern.allowed(
                        db_row["target_schema"]
                ) or not self.config.table_pattern.allowed(
                        db_row["target_table"]):
                    continue

                # Target
                target_path = (
                    f'{db_name}.{db_row["target_schema"]}.{db_row["target_table"]}'
                )
                target = LineageItem(
                    dataset=LineageDataset(
                        platform=LineageDatasetPlatform.REDSHIFT,
                        path=target_path),
                    upstreams=set(),
                    collector_type=lineage_type,
                    query_parser_failed_sqls=list(),
                )

                sources: List[LineageDataset] = list()
                # Source
                if lineage_type in [
                        LineageCollectorType.QUERY_SQL_PARSER,
                        LineageCollectorType.NON_BINDING_VIEW,
                ]:
                    try:
                        sources = self._get_sources_from_query(
                            db_name=db_name, query=db_row["ddl"])
                    except Exception as e:
                        target.query_parser_failed_sqls.append(db_row["ddl"])
                        logger.warning(
                            f'Error parsing query {db_row["ddl"]} for lineage extraction.'
                            f"\nError was {e}.")
                else:
                    if lineage_type == LineageCollectorType.COPY:
                        platform = LineageDatasetPlatform.S3
                        path = db_row["filename"].strip()
                        if urlparse(path).scheme != "s3":
                            logger.warning(
                                f"Only s3 sources are supported with COPY. The source was: {path}."
                            )
                            continue
                    else:
                        platform = LineageDatasetPlatform.REDSHIFT
                        path = f'{db_name}.{db_row["source_schema"]}.{db_row["source_table"]}'

                    sources = [LineageDataset(
                        platform=platform,
                        path=path,
                    )]

                for source in sources:
                    # Filter out tables that no longer exist in Redshift
                    # (deleted in the meantime, or the query parser mis-captured the table name).
                    if (source.platform == LineageDatasetPlatform.REDSHIFT
                            and source.path not in self._all_tables_set):
                        logger.warning(f"{source.path} missing table")
                        continue

                    target.upstreams.add(source)

                # Merge upstreams if the dataset already exists in the lineage map.
                if target.dataset.path in self._lineage_map:

                    self._lineage_map[
                        target.dataset.path].upstreams = self._lineage_map[
                            target.dataset.path].upstreams.union(
                                target.upstreams)

                else:
                    self._lineage_map[target.dataset.path] = target

                logger.info(
                    f"Lineage[{target}]:{self._lineage_map[target.dataset.path]}"
                )

        except Exception as e:
            logger.warning(
                f"Extracting {lineage_type.name} lineage from Redshift failed. "
                f"Continuing...\nError was {e}.")