def test_data_generator(sqlalchemy_engine):
    """Every column of the sqlalchemy-backed source should be yielded (14 total)."""
    catalog, source, conn = sqlalchemy_engine
    total = sum(1 for _ in data_generator(catalog=catalog, source=source))
    assert total == 14
def test_data_generator_include_int_table(load_source):
    """Restricting to the partial_data_type table yields only its 2 columns."""
    catalog, source = load_source
    generated = data_generator(
        catalog=catalog,
        source=source,
        include_table_regex_str=["partial_data_type"],
    )
    assert sum(1 for _ in generated) == 2
def test_data_generator_exclude_table(load_source):
    """Excluding tables matching 'full.*' leaves 10 columns to scan."""
    catalog, source = load_source
    generated = data_generator(
        catalog=catalog,
        source=source,
        exclude_table_regex_str=["full.*"],
    )
    assert sum(1 for _ in generated) == 10
def test_data_generator_exclude_schema(load_source):
    """Excluding the source's only schema leaves nothing to yield."""
    catalog, source = load_source
    schemata = catalog.search_schema(source_like=source.name, schema_like="%")
    generated = data_generator(
        catalog=catalog,
        source=source,
        exclude_schema_regex_str=[schemata[0].name],
    )
    assert sum(1 for _ in generated) == 0
def test_incremental_data_generator(setup_incremental):
    """A full pass yields 434 items; an incremental pass from the last
    recorded task timestamp yields only the 14 newer ones."""
    catalog, source_id = setup_incremental
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        tasks = catalog.get_tasks_by_app_name("piicatcher.{}".format(source.name))

        full_pass = data_generator(catalog=catalog, source=source)
        assert sum(1 for _ in full_pass) == 434

        incremental_pass = data_generator(
            catalog=catalog, source=source, last_run=tasks[0].updated_at
        )
        assert sum(1 for _ in incremental_pass) == 14
def test_deep_scan(load_data_and_pull):
    """A data scan with the datum regex detector tags partial_pii.a as Phone."""
    catalog, source_id = load_data_and_pull
    with catalog.managed_session:
        source = catalog.get_source_by_id(source_id)
        data_scan(
            catalog=catalog,
            detectors=[DatumRegexDetector()],
            work_generator=column_generator(catalog=catalog, source=source),
            generator=data_generator(catalog=catalog, source=source),
        )
        schemata = catalog.search_schema(source_like=source.name, schema_like="%")
        column = catalog.get_column(
            source_name=source.name,
            schema_name=schemata[0].name,
            table_name="partial_pii",
            column_name="a",
        )
        assert column.pii_type == Phone()
def scan_database(
    catalog: Catalog,
    source: CatSource,
    scan_type: ScanTypeEnum = ScanTypeEnum.metadata,
    incremental: bool = True,
    output_format: OutputFormat = OutputFormat.tabular,
    list_all: bool = False,
    include_schema_regex: Optional[List[str]] = None,
    exclude_schema_regex: Optional[List[str]] = None,
    include_table_regex: Optional[List[str]] = None,
    exclude_table_regex: Optional[List[str]] = None,
    sample_size: int = SMALL_TABLE_MAX,
) -> Union[List[Any], Dict[Any, Any]]:
    """Scan one catalog source for PII and return the findings.

    Refreshes the source's metadata via ``scan_sources``, then runs either a
    metadata scan (column names only) or a data scan (sampled rows) with every
    registered detector of the matching kind, and finally records a
    ``piicatcher.<source>`` task in the catalog (success or failure) so that
    subsequent ``incremental`` runs can pick up from the last run's timestamp.

    Args:
        catalog: Open data catalog to read metadata from and write results to.
        source: The catalog source to scan.
        scan_type: ``metadata`` to inspect column names, anything else to
            sample and inspect row data.
        incremental: When True, only columns changed since the last recorded
            ``piicatcher.<source>`` task are scanned.
        output_format: ``tabular`` returns a list of rows; otherwise a dict.
        list_all: Include non-PII columns in the output as well.
        include_schema_regex / exclude_schema_regex /
        include_table_regex / exclude_table_regex: Optional regex filters.
        sample_size: Number of rows sampled per table for data scans.

    Returns:
        The scan results, shaped per ``output_format``.

    Raises:
        Exception: Any error from scanning is re-raised after the failure is
            recorded as a task in the catalog.
    """

    def _fmt(regex_list: Optional[List[str]]) -> str:
        # Render a regex filter for the audit message; literal "None" if unset.
        return ",".join(regex_list) if regex_list is not None else "None"

    # NOTE: the last label previously read "exclude_schema" twice; it now
    # correctly labels the exclude_table filter.
    message = (
        "Source: {source_name}, scan_type: {scan_type}, "
        "include_schema: {include_schema}, exclude_schema: {exclude_schema}, "
        "include_table: {include_table}, exclude_table: {exclude_table}".format(
            source_name=source.name,
            scan_type=str(scan_type),
            include_schema=_fmt(include_schema_regex),
            exclude_schema=_fmt(exclude_schema_regex),
            include_table=_fmt(include_table_regex),
            exclude_table=_fmt(exclude_table_regex),
        )
    )
    status_message = "Success"
    exit_code = 0

    with catalog.managed_session:
        last_run: Optional[datetime.datetime] = None
        if incremental:
            last_task = catalog.get_latest_task("piicatcher.{}".format(source.name))
            last_run = last_task.updated_at if last_task is not None else None
            if last_run is not None:
                # logging interpolates with %-style args; "{}" would be
                # emitted verbatim and the timestamp dropped.
                LOGGER.debug("Last Run at %s", last_run)
            else:
                LOGGER.debug("No last run found")

        try:
            # Refresh catalog metadata for this source. (Previously this was
            # invoked twice with identical arguments; once is sufficient.)
            scan_sources(
                catalog=catalog,
                source_names=[source.name],
                include_schema_regex=include_schema_regex,
                exclude_schema_regex=exclude_schema_regex,
                include_table_regex=include_table_regex,
                exclude_table_regex=exclude_table_regex,
            )

            def _columns():
                # Fresh column stream with the shared filters / last_run window.
                return column_generator(
                    catalog=catalog,
                    source=source,
                    last_run=last_run,
                    exclude_schema_regex_str=exclude_schema_regex,
                    include_schema_regex_str=include_schema_regex,
                    exclude_table_regex_str=exclude_table_regex,
                    include_table_regex_str=include_table_regex,
                )

            def _detectors(base_class) -> List[Any]:
                # Instantiate every registered detector of the requested kind.
                return [
                    detector()
                    for detector in detectors.detector_registry.get_all().values()
                    if issubclass(detector, base_class)
                ]

            if scan_type == ScanTypeEnum.metadata:
                metadata_scan(
                    catalog=catalog,
                    detectors=_detectors(MetadataDetector),
                    work_generator=_columns(),
                    generator=_columns(),
                )
            else:
                data_scan(
                    catalog=catalog,
                    detectors=_detectors(DatumDetector),
                    work_generator=_columns(),
                    generator=data_generator(
                        catalog=catalog,
                        source=source,
                        last_run=last_run,
                        exclude_schema_regex_str=exclude_schema_regex,
                        include_schema_regex_str=include_schema_regex,
                        exclude_table_regex_str=exclude_table_regex,
                        include_table_regex_str=include_table_regex,
                        sample_size=sample_size,
                    ),
                    sample_size=sample_size,
                )

            if output_format == OutputFormat.tabular:
                return output_tabular(
                    catalog=catalog, source=source, list_all=list_all, last_run=last_run
                )
            return output_dict(
                catalog=catalog, source=source, list_all=list_all, last_run=last_run
            )
        except Exception as e:
            status_message = str(e)
            exit_code = 1
            raise  # bare re-raise preserves the original traceback
        finally:
            # Record the run (success or failure) so incremental scans can
            # discover this run's timestamp later.
            catalog.add_task(
                "piicatcher.{}".format(source.name),
                exit_code,
                "{}.{}".format(message, status_message),
            )