def __init__(self,
             name: str,
             description: Union[str, None],
             col_type: str,
             sort_order: int,
             badges: Union[List[str], None] = None,
             ) -> None:
    """
    Metadata for a single table column.

    TODO: Add stats
    :param name: column name
    :param description: free-text column description, or None
    :param col_type: column data type string
    :param sort_order: ordinal position of the column within the table
    :param badges: Optional. Column level badges
    """
    self.name = name
    self.description = DescriptionMetadata.create_description_metadata(
        source=None, text=description)
    self.type = col_type
    self.sort_order = sort_order
    # Normalise the optional badge input to a list, then wrap each entry
    # as a column-scoped Badge.
    self.badges = [Badge(b, 'column') for b in _format_as_list(badges)]
    # The following fields are populated by the ComplexTypeTransformer
    self._column_key: Optional[str] = None
    self._type_metadata: Optional[TypeMetadata] = None
def _load_csv(self) -> None:
    """
    Load the badge and table CSV files and build an iterator of
    BadgeMetadata records (one per table), stored in ``self._iter``.
    """
    with open(self.badge_file_location, 'r') as fin:
        self.badges = [dict(i) for i in csv.DictReader(fin)]

    # Group badges by the key of the table they belong to.
    parsed_badges = defaultdict(list)
    for badge_dict in self.badges:
        # Renamed from 'id' to avoid shadowing the builtin.
        table_key = self._get_key(badge_dict['database'],
                                  badge_dict['cluster'],
                                  badge_dict['schema'],
                                  badge_dict['table_name'])
        parsed_badges[table_key].append(
            Badge(name=badge_dict['name'], category=badge_dict['category']))

    with open(self.table_file_location, 'r') as fin:
        tables = [dict(i) for i in csv.DictReader(fin)]

    results = []
    for table_dict in tables:
        table_key = self._get_key(table_dict['database'],
                                  table_dict['cluster'],
                                  table_dict['schema'],
                                  table_dict['name'])
        # Use .get() with a default: the original checked
        # `if badges is None`, which was dead code because a
        # defaultdict(list) never returns None — and the bare lookup
        # also inserted empty lists into the defaultdict as a side effect.
        badges = parsed_badges.get(table_key, [])
        results.append(BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                     start_key=table_key,
                                     badges=badges))
    self._iter = iter(results)
def test_extraction_of_tablecolumn_badges(self) -> None:
    """
    Test Extraction using the combined CsvTableModel model class
    """
    scope = 'extractor.csvtablecolumn'
    self.conf = ConfigFactory.from_dict({
        f'{scope}.{CsvTableColumnExtractor.TABLE_FILE_LOCATION}': 'example/sample_data/sample_table.csv',
        f'{scope}.{CsvTableColumnExtractor.COLUMN_FILE_LOCATION}': 'example/sample_data/sample_col.csv',
    })

    extractor = CsvTableColumnExtractor()
    extractor.init(Scoped.get_scoped_conf(conf=self.conf,
                                          scope=extractor.get_scope()))

    # The first extracted table should carry the badges from sample_col.csv.
    result = extractor.extract()
    self.assertEqual(result.name, 'test_table1')
    self.assertEqual(result.columns[0].badges, [Badge('pk', 'column')])
    self.assertEqual(result.columns[1].badges, [Badge('pii', 'column')])
def __init__(self,
             name: str,
             description: Union[str, None],
             col_type: str,
             sort_order: int,
             badges: Union[List[str], None] = None,
             ) -> None:
    """
    Holds the metadata describing one column of a table.

    TODO: Add stats
    :param name: column name
    :param description: optional free-text description
    :param col_type: column data type string
    :param sort_order: ordinal position of the column
    :param badges: Optional. Column level badges
    """
    self.name = name
    self.description = DescriptionMetadata.create_description_metadata(
        source=None, text=description)
    self.type = col_type
    self.sort_order = sort_order
    # Badges arrive as plain strings; normalise the optional input and
    # wrap each one as a column-category Badge.
    self.badges = [Badge(b, 'column') for b in _format_as_list(badges)]
def __init__(self,
             name: str,
             description: Union[str, None],
             col_type: str,
             sort_order: int,
             badges: Union[List[str], None] = None
             ) -> None:
    """
    Column-level metadata container.

    TODO: Add stats
    :param name: column name
    :param description: optional free-text description
    :param col_type: column data type string
    :param sort_order: ordinal position of the column
    :param badges: optional list of badge names to attach to the column
    """
    self.name = name
    self.description = DescriptionMetadata.create_description_metadata(
        source=None, text=description)
    self.type = col_type
    self.sort_order = sort_order
    # A falsy badges value (None or empty list) yields no Badge objects.
    self.badges = [Badge(b, 'column') for b in badges] if badges else []
from databuilder.serializers.neptune_serializer import (
    NEPTUNE_CREATION_TYPE_JOB, NEPTUNE_CREATION_TYPE_NODE_PROPERTY_NAME_BULK_LOADER_FORMAT,
    NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT, NEPTUNE_HEADER_ID,
    NEPTUNE_HEADER_LABEL, NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT,
    NEPTUNE_RELATIONSHIP_HEADER_FROM, NEPTUNE_RELATIONSHIP_HEADER_TO,
)

db = 'hive'
SCHEMA = 'BASE'
TABLE = 'TEST'
CLUSTER = 'DEFAULT'

# Shared fixtures used across the badge tests below.
badge1 = Badge('badge1', 'column')
badge2 = Badge('badge2', 'column')


class TestBadge(unittest.TestCase):
    def setUp(self) -> None:
        super().setUp()
        # NOTE(review): 'badge_metada' looks like a typo for
        # 'badge_metadata', but the name is kept as-is in case other
        # tests outside this view reference it.
        self.badge_metada = BadgeMetadata(start_label='Column',
                                          start_key='hive://default.base/test/ds',
                                          badges=[badge1, badge2])

    def test_get_badge_key(self) -> None:
        key = self.badge_metada.get_badge_key(badge1.name)
        self.assertEqual(key, badge1.name)
def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
    """
    Generates the extract iterator for all of the model types created by the dbt files.

    Walks the dbt manifest's nodes, and for each dbt model that also appears
    in the catalog, may yield (subject to the self._extract_* / self._source_url
    config flags and the schema filter): a TableMetadata, a BadgeMetadata, and
    a TableSource. After all nodes are processed, optionally yields TableLineage
    records built from the manifest's child_map.
    """
    # Maps dbt node id -> Amundsen table key; populated during the node
    # pass and consumed by the lineage pass below.
    dbt_id_to_table_key = {}
    for tbl_node, manifest_content in self._dbt_manifest['nodes'].items():
        # Only process dbt models that are present in BOTH the manifest
        # and the catalog.
        if manifest_content['resource_type'] == DBT_MODEL_TYPE and tbl_node in self._dbt_catalog['nodes']:
            LOGGER.info(
                'Extracting dbt {}.{}'.format(manifest_content['schema'], manifest_content[self._model_name_key])
            )

            catalog_content = self._dbt_catalog['nodes'][tbl_node]

            # Column metadata is merged from both artifacts.
            tbl_columns: List[ColumnMetadata] = self._get_column_values(
                manifest_columns=manifest_content['columns'], catalog_columns=catalog_content['columns']
            )

            desc, desc_src = self._get_table_descriptions(manifest_content)
            tags, tbl_badges = self._get_table_tags_badges(manifest_content)

            tbl_metadata = TableMetadata(
                database=self._default_sanitize(self._database_name),
                # The dbt "database" is the cluster here
                cluster=self._default_sanitize(manifest_content['database']),
                schema=self._default_sanitize(manifest_content['schema']),
                name=self._default_sanitize(manifest_content[self._model_name_key]),
                is_view=catalog_content['metadata']['type'] == 'view',
                columns=tbl_columns,
                tags=tags,
                description=desc,
                description_source=desc_src
            )

            # Keep track for Lineage
            dbt_id_to_table_key[tbl_node] = tbl_metadata._get_table_key()

            # Optionally filter schemas in the output
            yield_schema = self._can_yield_schema(manifest_content['schema'])

            if self._extract_tables and yield_schema:
                yield tbl_metadata

            if self._extract_tags and tbl_badges and yield_schema:
                yield BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                    start_key=tbl_metadata._get_table_key(),
                                    badges=[Badge(badge, 'table') for badge in tbl_badges])

            if self._source_url and yield_schema:
                # NOTE(review): .get('original_file_path') may return None,
                # which would make os.path.join raise — presumably the
                # manifest always carries this field; confirm against the
                # dbt artifact schema.
                yield TableSource(db_name=tbl_metadata.database,
                                  cluster=tbl_metadata.cluster,
                                  schema=tbl_metadata.schema,
                                  table_name=tbl_metadata.name,
                                  source=os.path.join(self._source_url,
                                                      manifest_content.get('original_file_path')))

    if self._extract_lineage:
        for upstream, downstreams in self._dbt_manifest['child_map'].items():
            # Lineage is only emitted between tables extracted above.
            if upstream not in dbt_id_to_table_key:
                continue
            valid_downstreams = [
                dbt_id_to_table_key[k]
                for k in downstreams
                if k.startswith(DBT_MODEL_PREFIX) and dbt_id_to_table_key.get(k)
            ]
            if valid_downstreams:
                yield TableLineage(
                    table_key=dbt_id_to_table_key[upstream],
                    downstream_deps=valid_downstreams
                )
def test_badge_name_category_are_lower_cases(self) -> None:
    """Badge lower-cases both the name and the category it is given."""
    mixed_case_badge = Badge('BadGe3', 'COLUMN_3')
    self.assertEqual(mixed_case_badge.name, 'badge3')
    self.assertEqual(mixed_case_badge.category, 'column_3')
def test_extraction_with_model_class(self) -> None:
    """
    Test Extraction using model class

    Drives the DbtExtractor end-to-end over the sample catalog/manifest and
    checks one representative record of each model type it emits. The
    assertions depend on the extractor's yield order, so records are pulled
    sequentially with _extract_until_not_these.
    """
    config_dict = {
        f'extractor.dbt.{DbtExtractor.DATABASE_NAME}': self.database_name,
        f'extractor.dbt.{DbtExtractor.CATALOG_JSON}': self.catalog_file_loc,
        f'extractor.dbt.{DbtExtractor.MANIFEST_JSON}': self.manifest_data,
        f'extractor.dbt.{DbtExtractor.SOURCE_URL}': self.source_url
    }
    self.conf = ConfigFactory.from_dict(config_dict)
    extractor = DbtExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    # One block of tests for each type of model created
    extracted_classes = []

    # First record: TableMetadata.
    result = extractor.extract()
    self.assertTrue(isinstance(result, TableMetadata))
    self.assertEqual(result.name, 'fact_third_party_performance')
    self.assertEqual(
        result.description.text, 'the performance for third party vendors loss rate by day.')
    self.assertEqual(result.database, self.database_name)
    self.assertEqual(result.cluster, 'dbt_demo')
    self.assertEqual(result.schema, 'public')
    self.assertEqual(result.tags, [])
    self.assertEqual(result.is_view, True)
    extracted_classes.append(TableMetadata)

    # Next new type: TableSource.
    result2 = _extract_until_not_these(extractor, extracted_classes)
    self.assertTrue(isinstance(result2, TableSource))
    self.assertEqual(result2.db, self.database_name)
    self.assertEqual(result2.cluster, 'dbt_demo')
    self.assertEqual(result2.schema, 'public')
    self.assertEqual(result2.table, 'fact_third_party_performance')
    self.assertEqual(
        result2.source, 'test_url/models/call_center/fact_third_party_performance.sql')
    extracted_classes.append(TableSource)

    # Next new type: BadgeMetadata.
    result3 = _extract_until_not_these(extractor, extracted_classes)
    self.assertTrue(isinstance(result3, BadgeMetadata))
    self.assertEqual(
        result3.badges, [Badge('finance', 'table'), Badge('certified', 'table')])
    extracted_classes.append(BadgeMetadata)

    # Next new type: TableLineage.
    result4 = _extract_until_not_these(extractor, extracted_classes)
    self.assertTrue(isinstance(result4, TableLineage))
    self.assertEqual(result4.table_key,
                     'snowflake://dbt_demo.public/fact_catalog_returns')
    self.assertEqual(
        result4.downstream_deps, ['snowflake://dbt_demo.public/fact_third_party_performance'])
    extracted_classes.append(TableLineage)

    # Should not be any other unique models created
    result5 = _extract_until_not_these(extractor, extracted_classes)
    self.assertEqual(result5, None)