def test_catalog_config_file(mocker, tmpdir):
    """open_catalog should load connection details from catalog.yml when one exists."""
    yaml_path = tmpdir / "catalog.yml"
    with yaml_path.open("w") as handle:
        handle.write("test_catalog_config")
    # Stub out the YAML loader and schema init so no real catalog is touched.
    mocker.patch("dbcat.api.catalog_connection_yaml")
    mocker.patch("dbcat.api.init_db")
    open_catalog(app_dir=tmpdir, secret=settings.DEFAULT_CATALOG_SECRET)
    # The file's raw contents must be handed to the YAML-based connector.
    dbcat.api.catalog_connection_yaml.assert_called_once_with("test_catalog_config")
def test_catalog_config_file(mocker, tmpdir):
    """open_catalog should hand the contents of catalog.yml to the YAML connector."""
    yaml_path = tmpdir / "catalog.yml"
    with yaml_path.open("w") as handle:
        handle.write("test_catalog_config")
    # Stub the YAML-based connection factory so nothing real is opened.
    mocker.patch("dbcat.api.catalog_connection_yaml")
    open_catalog(app_dir=tmpdir)
    dbcat.api.catalog_connection_yaml.assert_called_once_with("test_catalog_config")
def add_athena(
    name: str = typer.Option(..., help="A memorable name for the database"),
    aws_access_key_id: str = typer.Option(..., help="AWS Access Key"),
    aws_secret_access_key: str = typer.Option(..., help="AWS Secret Key"),
    region_name: str = typer.Option(..., help="AWS Region Name"),
    s3_staging_dir: str = typer.Option(..., help="S3 Staging Dir"),
):
    """Register an AWS Athena data warehouse as a catalog source."""
    # Gather the catalog connection settings once, then open the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(open_catalog(**connection_args)) as catalog:
        with catalog.managed_session:
            add_athena_source(
                catalog=catalog,
                name=name,
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                region_name=region_name,
                s3_staging_dir=s3_staging_dir,
            )
            typer.echo("Registered AWS Athena {}".format(name))
def add_snowflake(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    account: str = typer.Option(..., help="Snowflake Account Name"),
    warehouse: str = typer.Option(..., help="Snowflake Warehouse Name"),
    role: str = typer.Option(..., help="Snowflake Role Name"),
):
    """Register a Snowflake data warehouse as a catalog source."""
    # Gather the catalog connection settings once, then open the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(open_catalog(**connection_args)) as catalog:
        with catalog.managed_session:
            add_snowflake_source(
                catalog=catalog,
                name=name,
                username=username,
                password=password,
                database=database,
                account=account,
                warehouse=warehouse,
                role=role,
            )
            typer.echo("Registered Snowflake database {}".format(name))
def add_redshift(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    uri: str = typer.Option(..., help="Hostname or URI of the database"),
    port: Optional[int] = typer.Option(None, help="Port number of the database"),
):
    """Register an AWS Redshift database as a catalog source."""
    # Gather the catalog connection settings once, then open the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(open_catalog(**connection_args)) as catalog:
        with catalog.managed_session:
            add_redshift_source(
                catalog=catalog,
                name=name,
                username=username,
                password=password,
                database=database,
                uri=uri,
                port=port,
            )
            typer.echo("Registered Redshift database {}".format(name))
def add_postgresql(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    uri: str = typer.Option(..., help="Hostname or URI of the database"),
    port: Optional[int] = typer.Option(None, help="Port number of the database"),
):
    """Register a PostgreSQL database as a catalog source."""
    # Collect the source description up front; add_source stores it verbatim.
    source_kwargs = {
        "name": name,
        "source_type": "postgresql",
        "username": username,
        "password": password,
        "database": database,
        "uri": uri,
        "port": port,
    }
    with closing(open_catalog(**app_state["catalog_connection"])) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            catalog.add_source(**source_kwargs)
            typer.echo("Registered Postgres database {}".format(name))
def add_snowflake(
    name: str = typer.Option(..., help="A memorable name for the database"),
    username: str = typer.Option(..., help="Username or role to connect database"),
    password: str = typer.Option(..., help="Password of username or role"),
    database: str = typer.Option(..., help="Database name"),
    account: str = typer.Option(..., help="Snowflake Account Name"),
    warehouse: str = typer.Option(..., help="Snowflake Warehouse Name"),
    role: str = typer.Option(..., help="Snowflake Role Name"),
):
    """Register a Snowflake data warehouse as a catalog source."""
    # Collect the source description up front; add_source stores it verbatim.
    source_kwargs = {
        "name": name,
        "source_type": "snowflake",
        "username": username,
        "password": password,
        "database": database,
        "account": account,
        "warehouse": warehouse,
        "role": role,
    }
    with closing(open_catalog(**app_state["catalog_connection"])) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            catalog.add_source(**source_kwargs)
            typer.echo("Registered Snowflake database {}".format(name))
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    """Scan table and column metadata from the selected sources into the catalog."""
    with closing(open_catalog(**app_state["catalog_connection"])) as catalog:
        init_db(catalog)
        try:
            scan_sources(
                catalog=catalog,
                source_names=source_name,
                include_schema_regex=include_schema,
                exclude_schema_regex=exclude_schema,
                include_table_regex=include_table,
                exclude_table_regex=exclude_table,
            )
        except NoMatchesError:
            # Nothing matched the include/exclude patterns; report and return.
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
def add_sqlite(
    name: str = typer.Option(..., help="A memorable name for the database"),
    path: Path = typer.Option(..., help="File path to SQLite database"),
):
    """Register a SQLite database file as a catalog source."""
    with closing(open_catalog(**app_state["catalog_connection"])) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            catalog.add_source(name=name, source_type="sqlite", path=path)
            typer.echo("Registered SQLite database {}".format(name))
def _load_catalog(self) -> Generator[TableMetadata, None, None]:
    """
    Yield a TableMetadata record for every table visible through the catalog.

    Sources are limited to ``self.source_names`` when that list is non-empty;
    otherwise every source registered in the catalog is walked. Tables are
    filtered by the instance's include/exclude schema and table regexes.
    Columns whose ``pii_type`` is set get "pii" plus the PII type name as badges.
    """
    LOGGER.debug(self.catalog_config.as_plain_ordered_dict())
    # Connection parameters come from the extractor's config dict; app_dir
    # follows typer's per-application directory convention ("tokern").
    catalog = open_catalog(
        app_dir=Path(typer.get_app_dir("tokern")),
        **self.catalog_config.as_plain_ordered_dict()
    )
    with closing(catalog) as catalog:
        with catalog.managed_session:
            if self.source_names is not None and len(self.source_names) > 0:
                # Explicit source list: resolve each name to a source object.
                sources = [
                    catalog.get_source(source_name)
                    for source_name in self.source_names
                ]
            else:
                # No filter given — walk every registered source.
                sources = catalog.get_sources()
            for source in sources:
                for schema, table in table_generator(
                    catalog=catalog,
                    source=source,
                    include_schema_regex_str=self.include_schema_regex,
                    exclude_schema_regex_str=self.exclude_schema_regex,
                    include_table_regex_str=self.include_table_regex,
                    exclude_table_regex_str=self.exclude_table_regex,
                ):
                    columns = []
                    for column in catalog.get_columns_for_table(table):
                        badges = []
                        if column.pii_type is not None:
                            # Tag PII columns with a generic "pii" badge plus
                            # the specific detected type (e.g. its enum name).
                            badges.append("pii")
                            badges.append(column.pii_type.name)
                        columns.append(
                            ColumnMetadata(
                                name=column.name,
                                description="",
                                col_type=column.data_type,
                                sort_order=column.sort_order,
                                badges=badges,
                            )
                        )
                    # NOTE(review): source.database maps to "database" and
                    # source.name to "cluster" in the emitted metadata —
                    # presumably the consumer's expected hierarchy; confirm.
                    yield TableMetadata(
                        database=source.database,
                        cluster=source.name,
                        schema=schema.name,
                        name=table.name,
                        columns=columns,
                        description="",
                    )
def add_sqlite(
    name: str = typer.Option(..., help="A memorable name for the database"),
    path: Path = typer.Option(..., help="File path to SQLite database"),
):
    """Register a SQLite database file as a catalog source."""
    # Gather the catalog connection settings once, then open the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(open_catalog(**connection_args)) as catalog:
        with catalog.managed_session:
            add_sqlite_source(catalog=catalog, name=name, path=path)
            typer.echo("Registered SQLite database {}".format(name))
def add_athena(
    name: str = typer.Option(..., help="A memorable name for the database"),
    aws_access_key_id: str = typer.Option(..., help="AWS Access Key"),
    aws_secret_access_key: str = typer.Option(..., help="AWS Secret Key"),
    region_name: str = typer.Option(..., help="AWS Region Name"),
    s3_staging_dir: str = typer.Option(..., help="S3 Staging Dir"),
):
    """Register an AWS Athena data warehouse as a catalog source.

    Opens the catalog, ensures its schema exists, and stores the Athena
    connection details under ``name``.
    """
    catalog = open_catalog(**app_state["catalog_connection"])
    with closing(catalog):
        init_db(catalog)
        with catalog.managed_session:
            catalog.add_source(
                name=name,
                # BUG FIX: was source_type="snowflake" — a copy-paste error
                # from add_snowflake. An Athena source must be registered as
                # "athena" so downstream scanners pick the correct connector;
                # all other arguments here are Athena-specific credentials.
                source_type="athena",
                aws_access_key_id=aws_access_key_id,
                aws_secret_access_key=aws_secret_access_key,
                region_name=region_name,
                s3_staging_dir=s3_staging_dir,
            )
            typer.echo("Registered AWS Athena {}".format(name))
def scan(
    source_name: Optional[List[str]] = typer.Option(
        None, help="List of names of database and data warehouses"
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
):
    """Scan table and column metadata from the selected sources into the catalog."""
    # Gather the catalog connection settings once, then open the catalog.
    connection_args = dict(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(open_catalog(**connection_args)) as catalog:
        init_db(catalog)
        try:
            scan_sources(
                catalog=catalog,
                source_names=source_name,
                include_schema_regex=include_schema,
                exclude_schema_regex=exclude_schema,
                include_table_regex=include_table,
                exclude_table_regex=exclude_table,
            )
        except NoMatchesError:
            # Nothing matched the include/exclude patterns; report and return.
            typer.echo(
                "No schema or tables scanned. Ensure include/exclude patterns are correct "
                "and database has tables"
            )
def test_default_catalog(tmpdir):
    """Without explicit connection settings, open_catalog falls back to SQLite."""
    expected_db = tmpdir / "catalog.db"
    catalog = open_catalog(app_dir=tmpdir, secret=settings.DEFAULT_CATALOG_SECRET)
    assert isinstance(catalog, SqliteCatalog)
    # Initializing the schema should materialize the sqlite file on disk.
    init_db(catalog)
    assert expected_db.exists()
def detect(
    source_name: str = typer.Option(..., help="Name of database to scan."),
    scan_type: ScanTypeEnum = typer.Option(
        ScanTypeEnum.metadata,
        help="Choose deep(scan data) or shallow(scan column names only)",
    ),
    incremental: bool = typer.Option(
        True, help="Scan columns updated or created since last run",
    ),
    list_all: bool = typer.Option(
        False,
        help="List all columns. By default only columns with PII information is listed",
    ),
    include_schema: Optional[List[str]] = typer.Option(None, help=schema_help_text),
    exclude_schema: Optional[List[str]] = typer.Option(
        None, help=exclude_schema_help_text
    ),
    include_table: Optional[List[str]] = typer.Option(None, help=table_help_text),
    exclude_table: Optional[List[str]] = typer.Option(
        None, help=exclude_table_help_text
    ),
    sample_size: int = typer.Option(
        SMALL_TABLE_MAX, help="Sample size for large tables when running deep scan."
    ),
):
    """Run a PII scan against one registered source and print the result.

    Opens the catalog from ``dbcat.settings``, resolves ``source_name`` to a
    registered source, runs ``scan_database`` with the requested scan type and
    include/exclude filters, and echoes the formatted output. Exits with
    status 1 when no schema or table matches the filters.
    """
    catalog = open_catalog(
        app_dir=dbcat.settings.APP_DIR,
        secret=dbcat.settings.CATALOG_SECRET,
        path=dbcat.settings.CATALOG_PATH,
        host=dbcat.settings.CATALOG_HOST,
        port=dbcat.settings.CATALOG_PORT,
        user=dbcat.settings.CATALOG_USER,
        password=dbcat.settings.CATALOG_PASSWORD,
        database=dbcat.settings.CATALOG_DB,
    )
    with closing(catalog) as catalog:
        init_db(catalog)
        with catalog.managed_session:
            source = catalog.get_source(source_name)
            try:
                op = scan_database(
                    catalog=catalog,
                    source=source,
                    scan_type=scan_type,
                    incremental=incremental,
                    output_format=dbcat.settings.OUTPUT_FORMAT,
                    list_all=list_all,
                    include_schema_regex=include_schema,
                    exclude_schema_regex=exclude_schema,
                    include_table_regex=include_table,
                    exclude_table_regex=exclude_table,
                    sample_size=sample_size,
                )
                typer.echo(message=str_output(op, dbcat.settings.OUTPUT_FORMAT))
            except NoMatchesError:
                typer.echo(message=NoMatchesError.message)
                # BUG FIX: typer.Exit is an exception — the original merely
                # instantiated it (`typer.Exit(1)`), a no-op, so the command
                # exited with status 0 even when nothing was scanned. It must
                # be raised for the non-zero exit code to reach the shell.
                raise typer.Exit(1)
def test_default_catalog(tmpdir):
    """Without explicit connection settings, open_catalog falls back to SQLite."""
    expected_db = tmpdir / "catalog.db"
    catalog = open_catalog(app_dir=tmpdir)
    assert isinstance(catalog, SqliteCatalog)
    # Initializing the schema should materialize the sqlite file on disk.
    init_db(catalog)
    assert expected_db.exists()
def get_workunits(self) -> Iterable[WorkUnit]:
    """
    Yield a DataHub SqlWorkUnit for every table visible through the catalog.

    Sources are limited to ``self.config.source_names`` when that list is
    non-empty; otherwise every registered source is walked. Tables are
    filtered by the config's include/exclude schema and table regexes.
    Columns whose ``pii_type`` is set are tagged ``urn:li:tag:pii`` plus a
    tag for the specific PII type.
    """
    # Connection parameters come from the source config; app_dir follows
    # typer's per-application directory convention ("tokern").
    catalog = open_catalog(
        app_dir=Path(typer.get_app_dir("tokern")),
        secret=self.config.secret,
        path=Path(self.config.path) if self.config.path is not None else None,
        user=self.config.user,
        password=self.config.password,
        host=self.config.host,
        port=self.config.port,
        database=self.config.database,
    )
    with closing(catalog) as catalog:
        with catalog.managed_session:
            if (self.config.source_names is not None
                    and len(self.config.source_names) > 0):
                # Explicit source list: resolve each name to a source object.
                sources = [
                    catalog.get_source(source_name)
                    for source_name in self.config.source_names
                ]
            else:
                # No filter given — walk every registered source.
                sources = catalog.get_sources()
            for source in sources:
                for schema, table in table_generator(
                    catalog=catalog,
                    source=source,
                    include_schema_regex_str=self.config.include_schema_regex,
                    exclude_schema_regex_str=self.config.exclude_schema_regex,
                    include_table_regex_str=self.config.include_table_regex,
                    exclude_table_regex_str=self.config.exclude_table_regex,
                ):
                    # Dataset name optionally prefixed with the source name,
                    # controlled by config.include_source_name.
                    if self.config.include_source_name:
                        dataset_name = f"{source.name}.{schema.name}.{table.name}"
                    else:
                        dataset_name = f"{schema.name}.{table.name}"
                    self.report.report_entity_scanned(dataset_name)
                    dataset_urn = f"urn:li:dataset:(urn:li:dataPlatform:{source.source_type},{dataset_name},{self.config.env})"
                    dataset_snapshot = DatasetSnapshot(
                        urn=dataset_urn,
                        aspects=[],
                    )
                    schema_fields = []
                    for column in catalog.get_columns_for_table(table):
                        global_tags: Optional[GlobalTagsClass] = None
                        if column.pii_type is not None:
                            # PII columns get the generic pii tag plus one
                            # for the specific detected type (lower-cased).
                            global_tags = GlobalTagsClass(tags=[
                                TagAssociationClass("urn:li:tag:pii"),
                                TagAssociationClass(
                                    f"urn:li:tag:{column.pii_type.name.lower()}"
                                ),
                            ])
                        schema_fields.append(
                            SchemaField(
                                fieldPath=column.name,
                                type=CatalogSource.get_column_type(
                                    column.data_type),
                                nativeDataType=column.data_type,
                                description=None,
                                nullable=True,
                                recursive=False,
                                globalTags=global_tags,
                            ))
                    # NOTE(review): columns=[] with the real fields passed as
                    # canonical_schema — presumably intentional for this
                    # helper's signature; confirm against get_schema_metadata.
                    schema_metadata = get_schema_metadata(
                        sql_report=self.report,
                        dataset_name=dataset_name,
                        platform=source.source_type,
                        columns=[],
                        canonical_schema=schema_fields,
                    )
                    dataset_snapshot.aspects.append(schema_metadata)
                    mce = MetadataChangeEvent(
                        proposedSnapshot=dataset_snapshot)
                    wu = SqlWorkUnit(id=dataset_name, mce=mce)
                    self.report.report_workunit(wu)
                    yield wu