Пример #1
0
def test_catalog_config_file(mocker, tmpdir):
    """Expect ``open_catalog`` to pass the text of ``catalog.yml`` to the YAML loader.

    A ``catalog.yml`` holding placeholder text is written into ``tmpdir``.
    ``dbcat.api.catalog_connection_yaml`` and ``dbcat.api.init_db`` are both
    patched, and the former must be called exactly once with the file's text.
    """
    yaml_path = tmpdir / "catalog.yml"
    with yaml_path.open("w") as handle:
        handle.write("test_catalog_config")

    # mocker.patch returns the mock it installs at the patched location.
    yaml_loader = mocker.patch("dbcat.api.catalog_connection_yaml")
    mocker.patch("dbcat.api.init_db")

    open_catalog(app_dir=tmpdir, secret=settings.DEFAULT_CATALOG_SECRET)

    yaml_loader.assert_called_once_with("test_catalog_config")
Пример #2
0
def test_catalog_config_file(mocker, tmpdir):
    """Expect ``open_catalog`` to pass the text of ``catalog.yml`` to the YAML loader.

    A ``catalog.yml`` holding placeholder text is written into ``tmpdir``,
    ``dbcat.api.catalog_connection_yaml`` is patched, and the patched loader
    must be called exactly once with the file's text.
    """
    yaml_path = tmpdir / "catalog.yml"
    with yaml_path.open("w") as handle:
        handle.write("test_catalog_config")

    # mocker.patch returns the mock it installs at the patched location.
    yaml_loader = mocker.patch("dbcat.api.catalog_connection_yaml")

    open_catalog(app_dir=tmpdir)

    yaml_loader.assert_called_once_with("test_catalog_config")
Пример #3
0
def add_athena(
    name: str = typer.Option(..., help="A memorable name for the database"),
    aws_access_key_id: str = typer.Option(..., help="AWS Access Key"),
    aws_secret_access_key: str = typer.Option(..., help="AWS Secret Key"),
    region_name: str = typer.Option(..., help="AWS Region Name"),
    s3_staging_dir: str = typer.Option(..., help="S3 Staging Dir"),
):
    # Registers an AWS Athena source in the catalog described by
    # dbcat.settings, then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog_args = {
        "app_dir": dbcat.settings.APP_DIR,
        "secret": dbcat.settings.CATALOG_SECRET,
        "path": dbcat.settings.CATALOG_PATH,
        "host": dbcat.settings.CATALOG_HOST,
        "port": dbcat.settings.CATALOG_PORT,
        "user": dbcat.settings.CATALOG_USER,
        "password": dbcat.settings.CATALOG_PASSWORD,
        "database": dbcat.settings.CATALOG_DB,
    }
    catalog = open_catalog(**catalog_args)

    source_args = {
        "name": name,
        "aws_access_key_id": aws_access_key_id,
        "aws_secret_access_key": aws_secret_access_key,
        "region_name": region_name,
        "s3_staging_dir": s3_staging_dir,
    }
    with closing(catalog):
        with catalog.managed_session:
            add_athena_source(catalog=catalog, **source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered AWS Athena {}".format(name)
        typer.echo(confirmation)
Пример #4
0
def add_snowflake(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    account: str = typer.Option(..., help="Snowflake Account Name"),
    warehouse: str = typer.Option(..., help="Snowflake Warehouse Name"),
    role: str = typer.Option(..., help="Snowflake Role Name"),
):
    # Registers a Snowflake source in the catalog described by
    # dbcat.settings, then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog_args = {
        "app_dir": dbcat.settings.APP_DIR,
        "secret": dbcat.settings.CATALOG_SECRET,
        "path": dbcat.settings.CATALOG_PATH,
        "host": dbcat.settings.CATALOG_HOST,
        "port": dbcat.settings.CATALOG_PORT,
        "user": dbcat.settings.CATALOG_USER,
        "password": dbcat.settings.CATALOG_PASSWORD,
        "database": dbcat.settings.CATALOG_DB,
    }
    catalog = open_catalog(**catalog_args)

    # The `password` / `database` entries here are the Snowflake credentials
    # passed on the command line, not the catalog's own settings above.
    source_args = {
        "name": name,
        "username": username,
        "password": password,
        "database": database,
        "account": account,
        "warehouse": warehouse,
        "role": role,
    }
    with closing(catalog):
        with catalog.managed_session:
            add_snowflake_source(catalog=catalog, **source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered Snowflake database {}".format(name)
        typer.echo(confirmation)
Пример #5
0
def add_redshift(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    uri: str = typer.Option(..., help="Hostname or URI of the database"),
    port: Optional[int] = typer.Option(None, help="Port number of the database"),
):
    # Registers a Redshift source in the catalog described by dbcat.settings,
    # then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog_args = {
        "app_dir": dbcat.settings.APP_DIR,
        "secret": dbcat.settings.CATALOG_SECRET,
        "path": dbcat.settings.CATALOG_PATH,
        "host": dbcat.settings.CATALOG_HOST,
        "port": dbcat.settings.CATALOG_PORT,
        "user": dbcat.settings.CATALOG_USER,
        "password": dbcat.settings.CATALOG_PASSWORD,
        "database": dbcat.settings.CATALOG_DB,
    }
    catalog = open_catalog(**catalog_args)

    # These are the Redshift connection details from the command line, not
    # the catalog's own settings above.
    source_args = {
        "name": name,
        "username": username,
        "password": password,
        "database": database,
        "uri": uri,
        "port": port,
    }
    with closing(catalog):
        with catalog.managed_session:
            add_redshift_source(catalog=catalog, **source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered Redshift database {}".format(name)
        typer.echo(confirmation)
Пример #6
0
def add_postgresql(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    uri: str = typer.Option(..., help="Hostname or URI of the database"),
    port: Optional[int] = typer.Option(None, help="Port number of the database"),
):
    # Registers a PostgreSQL source in the catalog configured under
    # app_state["catalog_connection"], then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    connection_args = app_state["catalog_connection"]
    catalog = open_catalog(**connection_args)

    source_args = {
        "name": name,
        "source_type": "postgresql",
        "username": username,
        "password": password,
        "database": database,
        "uri": uri,
        "port": port,
    }
    with closing(catalog):
        init_db(catalog)

        with catalog.managed_session:
            catalog.add_source(**source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered Postgres database {}".format(name)
        typer.echo(confirmation)
Пример #7
0
def add_snowflake(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    account: str = typer.Option(..., help="Snowflake Account Name"),
    warehouse: str = typer.Option(..., help="Snowflake Warehouse Name"),
    role: str = typer.Option(..., help="Snowflake Role Name"),
):
    # Registers a Snowflake source in the catalog configured under
    # app_state["catalog_connection"], then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    connection_args = app_state["catalog_connection"]
    catalog = open_catalog(**connection_args)

    source_args = {
        "name": name,
        "source_type": "snowflake",
        "username": username,
        "password": password,
        "database": database,
        "account": account,
        "warehouse": warehouse,
        "role": role,
    }
    with closing(catalog):
        init_db(catalog)

        with catalog.managed_session:
            catalog.add_source(**source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered Snowflake database {}".format(name)
        typer.echo(confirmation)
Пример #8
0
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    # Runs scan_sources over the catalog configured under
    # app_state["catalog_connection"], forwarding the source names and the
    # include/exclude patterns. A NoMatchesError is reported as a message
    # instead of propagating.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    connection_args = app_state["catalog_connection"]
    catalog = open_catalog(**connection_args)

    selection = {
        "source_names": source_name,
        "include_schema_regex": include_schema,
        "exclude_schema_regex": exclude_schema,
        "include_table_regex": include_table,
        "exclude_table_regex": exclude_table,
    }
    with closing(catalog):
        init_db(catalog)
        try:
            scan_sources(catalog=catalog, **selection)
        except NoMatchesError:
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
Пример #9
0
def add_sqlite(
    name: str = typer.Option(..., help="A memorable name for the database"),
    path: Path = typer.Option(..., help="File path to SQLite database"),
):
    # Registers a SQLite source in the catalog configured under
    # app_state["catalog_connection"], then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    connection_args = app_state["catalog_connection"]
    catalog = open_catalog(**connection_args)

    source_args = {"name": name, "source_type": "sqlite", "path": path}
    with closing(catalog):
        init_db(catalog)

        with catalog.managed_session:
            catalog.add_source(**source_args)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered SQLite database {}".format(name)
        typer.echo(confirmation)
Пример #10
0
    def _load_catalog(self) -> Generator[TableMetadata, None, None]:
        """Yield a ``TableMetadata`` record for each table selected from the catalog.

        The catalog is opened with the Typer application directory for
        ``"tokern"`` plus the key/value pairs of ``self.catalog_config``.
        Sources are restricted to ``self.source_names`` when that is a
        non-empty collection; otherwise every source in the catalog is used.
        Tables come from ``table_generator`` with the configured
        include/exclude regexes. A column whose ``pii_type`` is set gets the
        badges ``"pii"`` and the PII type's name; all descriptions are empty
        strings.
        """
        LOGGER.debug(self.catalog_config.as_plain_ordered_dict())

        app_dir = Path(typer.get_app_dir("tokern"))
        catalog = open_catalog(
            app_dir=app_dir, **self.catalog_config.as_plain_ordered_dict()
        )
        # Records are yielded while the catalog and its session are still
        # open; both `with` blocks exit only when the generator finishes.
        with closing(catalog):
            with catalog.managed_session:
                requested = self.source_names
                if requested is None or len(requested) == 0:
                    sources = catalog.get_sources()
                else:
                    sources = [catalog.get_source(label) for label in requested]

                for source in sources:
                    selected_tables = table_generator(
                        catalog=catalog,
                        source=source,
                        include_schema_regex_str=self.include_schema_regex,
                        exclude_schema_regex_str=self.exclude_schema_regex,
                        include_table_regex_str=self.include_table_regex,
                        exclude_table_regex_str=self.exclude_table_regex,
                    )
                    for schema, table in selected_tables:
                        column_records = []
                        for column in catalog.get_columns_for_table(table):
                            pii_type = column.pii_type
                            badges = [] if pii_type is None else ["pii", pii_type.name]
                            column_record = ColumnMetadata(
                                name=column.name,
                                description="",
                                col_type=column.data_type,
                                sort_order=column.sort_order,
                                badges=badges,
                            )
                            column_records.append(column_record)

                        yield TableMetadata(
                            database=source.database,
                            cluster=source.name,
                            schema=schema.name,
                            name=table.name,
                            columns=column_records,
                            description="",
                        )
Пример #11
0
def add_sqlite(
    name: str = typer.Option(..., help="A memorable name for the database"),
    path: Path = typer.Option(..., help="File path to SQLite database"),
):
    # Registers a SQLite source in the catalog described by dbcat.settings,
    # then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog_args = {
        "app_dir": dbcat.settings.APP_DIR,
        "secret": dbcat.settings.CATALOG_SECRET,
        # Location of the catalog itself; unrelated to the `path` option,
        # which points at the SQLite database being registered.
        "path": dbcat.settings.CATALOG_PATH,
        "host": dbcat.settings.CATALOG_HOST,
        "port": dbcat.settings.CATALOG_PORT,
        "user": dbcat.settings.CATALOG_USER,
        "password": dbcat.settings.CATALOG_PASSWORD,
        "database": dbcat.settings.CATALOG_DB,
    }
    catalog = open_catalog(**catalog_args)

    with closing(catalog):
        with catalog.managed_session:
            add_sqlite_source(catalog=catalog, name=name, path=path)
        # The confirmation is printed after the session block has ended.
        confirmation = "Registered SQLite database {}".format(name)
        typer.echo(confirmation)
Пример #12
0
def add_athena(
    name: str = typer.Option(..., help="A memorable name for the database"),
    aws_access_key_id: str = typer.Option(..., help="AWS Access Key"),
    aws_secret_access_key: str = typer.Option(..., help="AWS Secret Key"),
    region_name: str = typer.Option(..., help="AWS Region Name"),
    s3_staging_dir: str = typer.Option(..., help="S3 Staging Dir"),
):
    # Registers an AWS Athena source in the catalog configured under
    # app_state["catalog_connection"], then prints a confirmation.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog = open_catalog(**app_state["catalog_connection"])
    with closing(catalog):
        init_db(catalog)

        with catalog.managed_session:
            catalog.add_source(
                name=name,
                # Was "snowflake" (copy-paste slip): an Athena source must be
                # stored under its own type, matching the AWS-specific
                # arguments passed below.
                source_type="athena",
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                region_name=region_name,
                s3_staging_dir=s3_staging_dir,
            )
        # The confirmation is printed after the session block has ended.
        typer.echo("Registered AWS Athena {}".format(name))
Пример #13
0
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    # Runs scan_sources over the catalog described by dbcat.settings,
    # forwarding the source names and the include/exclude patterns. A
    # NoMatchesError is reported as a message instead of propagating.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog_args = {
        "app_dir": dbcat.settings.APP_DIR,
        "secret": dbcat.settings.CATALOG_SECRET,
        "path": dbcat.settings.CATALOG_PATH,
        "host": dbcat.settings.CATALOG_HOST,
        "port": dbcat.settings.CATALOG_PORT,
        "user": dbcat.settings.CATALOG_USER,
        "password": dbcat.settings.CATALOG_PASSWORD,
        "database": dbcat.settings.CATALOG_DB,
    }
    catalog = open_catalog(**catalog_args)

    selection = {
        "source_names": source_name,
        "include_schema_regex": include_schema,
        "exclude_schema_regex": exclude_schema,
        "include_table_regex": include_table,
        "exclude_table_regex": exclude_table,
    }
    with closing(catalog):
        init_db(catalog)
        try:
            scan_sources(catalog=catalog, **selection)
        except NoMatchesError:
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
Пример #14
0
def test_default_catalog(tmpdir):
    """With only ``app_dir`` and a secret, expect a SQLite catalog at ``catalog.db``.

    The returned object must be a ``SqliteCatalog``, and ``catalog.db`` must
    exist inside ``tmpdir`` once ``init_db`` has been called.
    """
    expected_db_file = tmpdir / "catalog.db"

    opened = open_catalog(app_dir=tmpdir, secret=settings.DEFAULT_CATALOG_SECRET)
    assert isinstance(opened, SqliteCatalog)

    init_db(opened)
    assert expected_db_file.exists()
Пример #15
0
def detect(
    source_name: str = typer.Option(..., help="Name of database to scan."),
    scan_type: ScanTypeEnum = typer.Option(
        ScanTypeEnum.metadata,
        help="Choose deep(scan data) or shallow(scan column names only)",
    ),
    incremental: bool = typer.Option(
        True,
        help="Scan columns updated or created since last run",
    ),
    list_all: bool = typer.Option(
        False,
        help="List all columns. By default only columns with PII information is listed",
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
    sample_size: int = typer.Option(
        SMALL_TABLE_MAX,
        help="Sample size for large tables when running deep scan.",
    ),
):
    # Scans one registered source with scan_database and prints the result in
    # the format given by dbcat.settings.OUTPUT_FORMAT. When the scan raises
    # NoMatchesError, its message is printed and the command exits with
    # status 1.
    # NOTE: documented with comments on purpose; a docstring on a Typer
    # command is rendered as its help text.
    catalog = open_catalog(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )

    nothing_matched = False
    with closing(catalog) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            source = catalog.get_source(source_name)
            try:
                op = scan_database(
                    catalog=catalog,
                    source=source,
                    scan_type=scan_type,
                    incremental=incremental,
                    output_format=dbcat.settings.OUTPUT_FORMAT,
                    list_all=list_all,
                    include_schema_regex=include_schema,
                    exclude_schema_regex=exclude_schema,
                    include_table_regex=include_table,
                    exclude_table_regex=exclude_table,
                    sample_size=sample_size,
                )
                typer.echo(message=str_output(op, dbcat.settings.OUTPUT_FORMAT))
            except NoMatchesError:
                typer.echo(message=NoMatchesError.message)
                nothing_matched = True

    # typer.Exit is an exception: the original built it without `raise`, so
    # the exit status stayed 0. It is raised here, after both context managers
    # have finished on their normal path, so the session and catalog are wound
    # down exactly as before the fix.
    if nothing_matched:
        raise typer.Exit(1)
Пример #16
0
def test_default_catalog(tmpdir):
    """With only ``app_dir`` given, expect a SQLite catalog at ``catalog.db``.

    The returned object must be a ``SqliteCatalog``, and ``catalog.db`` must
    exist inside ``tmpdir`` once ``init_db`` has been called.
    """
    expected_db_file = tmpdir / "catalog.db"

    opened = open_catalog(app_dir=tmpdir)
    assert isinstance(opened, SqliteCatalog)

    init_db(opened)
    assert expected_db_file.exists()
Пример #17
0
    def get_workunits(self) -> Iterable[WorkUnit]:
        """Yield one ``SqlWorkUnit`` for each table selected from the catalog.

        The catalog is opened with the Typer application directory for
        ``"tokern"`` and the connection values on ``self.config``. Sources are
        restricted to ``self.config.source_names`` when that is a non-empty
        collection; otherwise every source in the catalog is used. Tables come
        from ``table_generator`` with the configured include/exclude regexes.

        Each table becomes a ``DatasetSnapshot`` whose only aspect is the
        schema metadata built from its columns. Columns with a ``pii_type``
        are tagged ``urn:li:tag:pii`` plus a tag named after the lower-cased
        PII type. Every dataset name and every work unit is recorded on
        ``self.report`` before the work unit is yielded.
        """
        app_dir = Path(typer.get_app_dir("tokern"))
        if self.config.path is None:
            catalog_path = None
        else:
            catalog_path = Path(self.config.path)

        catalog = open_catalog(
            app_dir=app_dir,
            secret=self.config.secret,
            path=catalog_path,
            user=self.config.user,
            password=self.config.password,
            host=self.config.host,
            port=self.config.port,
            database=self.config.database,
        )

        # Work units are yielded while the catalog and its session are still
        # open; both `with` blocks exit only when the generator finishes.
        with closing(catalog):
            with catalog.managed_session:
                requested = self.config.source_names
                if requested is None or len(requested) == 0:
                    sources = catalog.get_sources()
                else:
                    sources = [catalog.get_source(label) for label in requested]

                for source in sources:
                    selected_tables = table_generator(
                        catalog=catalog,
                        source=source,
                        include_schema_regex_str=self.config.include_schema_regex,
                        exclude_schema_regex_str=self.config.exclude_schema_regex,
                        include_table_regex_str=self.config.include_table_regex,
                        exclude_table_regex_str=self.config.exclude_table_regex,
                    )
                    for schema, table in selected_tables:
                        # The source name is prepended only when configured.
                        if not self.config.include_source_name:
                            dataset_name = f"{schema.name}.{table.name}"
                        else:
                            dataset_name = f"{source.name}.{schema.name}.{table.name}"
                        self.report.report_entity_scanned(dataset_name)

                        dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{source.source_type},{dataset_name},{self.config.env})"
                        snapshot = DatasetSnapshot(urn=dataset_urn, aspects=[])

                        fields = []
                        for column in catalog.get_columns_for_table(table):
                            column_tags: Optional[GlobalTagsClass]
                            if column.pii_type is None:
                                column_tags = None
                            else:
                                generic_tag = TagAssociationClass("urn:li:tag:pii")
                                specific_tag = TagAssociationClass(
                                    f"urn:li:tag:{column.pii_type.name.lower()}"
                                )
                                column_tags = GlobalTagsClass(
                                    tags=[generic_tag, specific_tag]
                                )

                            field = SchemaField(
                                fieldPath=column.name,
                                type=CatalogSource.get_column_type(column.data_type),
                                nativeDataType=column.data_type,
                                description=None,
                                nullable=True,
                                recursive=False,
                                globalTags=column_tags,
                            )
                            fields.append(field)

                        # `columns` is passed empty; the prepared fields go in
                        # through `canonical_schema`.
                        schema_aspect = get_schema_metadata(
                            sql_report=self.report,
                            dataset_name=dataset_name,
                            platform=source.source_type,
                            columns=[],
                            canonical_schema=fields,
                        )
                        snapshot.aspects.append(schema_aspect)

                        change_event = MetadataChangeEvent(proposedSnapshot=snapshot)
                        work_unit = SqlWorkUnit(id=dataset_name, mce=change_event)
                        self.report.report_workunit(work_unit)
                        yield work_unit