def setUp(self) -> None:
    """Create the ``TableLineage`` fixture used by the test methods."""
    super(TestTableLineage, self).setUp()

    # Downstream dependency keys handed to the fixture.
    downstream_keys = [
        'hive://default.test_schema/test_table1',
        'hive://default.test_schema/test_table2',
    ]
    self.table_lineage = TableLineage(
        db_name='hive',
        schema=SCHEMA,
        table_name=TABLE,
        cluster=CLUSTER,
        downstream_deps=downstream_keys,
    )
def setUp(self) -> None:
    """Build the lineage fixture and the table keys the tests compare against."""
    super(TestTableLineage, self).setUp()

    # The same key identifies the fixture's table and the start of every
    # expected relation, so it is computed once and reused.
    origin_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
    downstream_keys = [
        'hive://default.test_schema/test_table1',
        'hive://default.test_schema/test_table2',
    ]
    self.table_lineage = TableLineage(table_key=origin_key,
                                      downstream_deps=downstream_keys)

    self.start_key = origin_key
    self.end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'
    self.end_key2 = f'{DB}://{CLUSTER}.test_schema/test_table2'
class TestTableLineage(unittest.TestCase):
    """Tests for ``TableLineage``: model key, node creation and relation creation."""

    def setUp(self):
        # type: () -> None
        """Create a ``TableLineage`` fixture with two downstream dependencies."""
        super(TestTableLineage, self).setUp()
        self.table_lineage = TableLineage(
            db_name='hive',
            schema_name=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            downstream_deps=[
                'hive://default.test_schema/test_table1',
                'hive://default.test_schema/test_table2'
            ])

    def test_get_table_model_key(self):
        # type: () -> None
        """The model key is built from db, cluster, schema and table."""
        metadata = self.table_lineage.get_table_model_key(db=DB,
                                                          cluster=CLUSTER,
                                                          schema=SCHEMA,
                                                          table=TABLE)
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self):
        # type: () -> None
        """The fixture is expected to produce no nodes."""
        nodes = self.table_lineage.create_nodes()
        self.assertEqual(len(nodes), 0)

    def test_create_relation(self):
        # type: () -> None
        """Two relations are created and the one to ``test_table1`` is among them."""
        relations = self.table_lineage.create_relation()
        # assertEqual really compares the count; assertTrue(len(x), 2) would
        # treat 2 as the failure message and pass for any non-empty list.
        self.assertEqual(len(relations), 2)

        start_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
                                                             schema=SCHEMA,
                                                             tbl=TABLE,
                                                             cluster=CLUSTER)
        end_key1 = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
                                                            schema='test_schema',
                                                            tbl='test_table1',
                                                            cluster=CLUSTER)

        relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key1,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
        }
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(relation, relations)
class TestTableLineage(unittest.TestCase):
    """Tests for ``TableLineage``: model key, node creation and relation creation."""

    def setUp(self) -> None:
        """Create a ``TableLineage`` fixture with two downstream dependencies."""
        super(TestTableLineage, self).setUp()
        self.table_lineage = TableLineage(
            db_name='hive',
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            downstream_deps=[
                'hive://default.test_schema/test_table1',
                'hive://default.test_schema/test_table2'
            ])

    def test_get_table_model_key(self) -> None:
        """The model key is built from db, cluster, schema and table."""
        metadata = self.table_lineage.get_table_model_key(db=DB,
                                                          cluster=CLUSTER,
                                                          schema=SCHEMA,
                                                          table=TABLE)
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """The fixture is expected to produce no nodes."""
        nodes = self.table_lineage.create_nodes()
        self.assertEqual(len(nodes), 0)

    def test_create_relation(self) -> None:
        """Two relations are created; the serialized one to ``test_table1`` is among them."""
        relations = self.table_lineage.create_relation()
        # assertEqual really compares the count; assertTrue(len(x), 2) would
        # treat 2 as the failure message and pass for any non-empty list.
        self.assertEqual(len(relations), 2)

        start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'

        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key1,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
        }

        # The comparison is made on the neo4j-serialized form of each relation.
        # The loop variable gets its own name so it cannot be confused with
        # the expected dict above.
        actual_relations = [
            neo4_serializer.serialize_relationship(created)
            for created in relations
        ]
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(expected_relation, actual_relations)
def _load_openlineage_event(self) -> Any:
    """
    Open the lineage file and prepare a lazy iterator of TableLineage records.

    Each line of the file is parsed as a JSON document. The parsed events are
    passed to ``self._extract_dataset_info`` and every item it produces is
    turned into a ``TableLineage`` whose ``table_key`` is the item's
    ``'input'`` and whose only downstream dependency is its ``'output'``.
    The resulting generator is stored on ``self._iter``; nothing is returned.
    """
    # The handle is kept on the instance and is NOT closed here: the
    # generators below read from it only when they are consumed.
    self.input_file = open(self.table_lineage_file_location, 'r')

    parsed_events = (json.loads(raw_line) for raw_line in self.input_file)
    dataset_pairs = self._extract_dataset_info(parsed_events)

    self._iter = (
        TableLineage(table_key=pair['input'],
                     downstream_deps=[pair['output']])
        for pair in dataset_pairs
    )
def _load_csv(self) -> None:
    """
    Read the table lineage CSV file and prepare an iterator of TableLineage records.

    The rows are read with ``csv.DictReader`` and kept, as plain dicts, on
    ``self.table_lineage``. Each row is then converted into a ``TableLineage``
    whose ``table_key`` is the row's ``source_table_key`` and whose only
    downstream dependency is the row's ``target_table_key``. An iterator over
    those records is stored on ``self._iter``.

    :raises KeyError: if the CSV header lacks ``source_table_key`` or
        ``target_table_key``
    """
    # newline='' hands newline handling to the csv module, which its
    # documentation requires so that quoted fields containing line breaks
    # are parsed correctly.
    with open(self.table_lineage_file_location, 'r', newline='') as fin:
        self.table_lineage = [dict(row) for row in csv.DictReader(fin)]

    self._iter = iter([
        TableLineage(table_key=row['source_table_key'],
                     downstream_deps=[row['target_table_key']])
        for row in self.table_lineage
    ])
def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
    """
    Yield every model object derived from the dbt manifest and catalog.

    For each manifest node that is a dbt model and also appears in the
    catalog, a ``TableMetadata`` is built and, depending on the extractor
    flags and the schema filter, the table itself, its badges and its source
    link are yielded. Once all nodes are processed, ``TableLineage`` records
    are yielded from the manifest's ``child_map`` if lineage extraction is on.
    """
    table_key_by_dbt_id = {}

    for dbt_id, manifest_entry in self._dbt_manifest['nodes'].items():
        # Only dbt models that the catalog also knows about are extracted.
        is_cataloged_model = (
            manifest_entry['resource_type'] == DBT_MODEL_TYPE
            and dbt_id in self._dbt_catalog['nodes']
        )
        if not is_cataloged_model:
            continue

        LOGGER.info(
            'Extracting dbt {}.{}'.format(manifest_entry['schema'], manifest_entry[self._model_name_key])
        )

        catalog_entry = self._dbt_catalog['nodes'][dbt_id]

        column_metadata: List[ColumnMetadata] = self._get_column_values(
            manifest_columns=manifest_entry['columns'],
            catalog_columns=catalog_entry['columns']
        )

        description, description_source = self._get_table_descriptions(manifest_entry)
        table_tags, badge_names = self._get_table_tags_badges(manifest_entry)

        table = TableMetadata(
            database=self._default_sanitize(self._database_name),
            # The dbt "database" is the cluster here
            cluster=self._default_sanitize(manifest_entry['database']),
            schema=self._default_sanitize(manifest_entry['schema']),
            name=self._default_sanitize(manifest_entry[self._model_name_key]),
            is_view=catalog_entry['metadata']['type'] == 'view',
            columns=column_metadata,
            tags=table_tags,
            description=description,
            description_source=description_source
        )

        # Recorded for every extracted model, even when the schema filter
        # suppresses its output, so lineage can still reference it.
        table_key_by_dbt_id[dbt_id] = table._get_table_key()

        # Optionally filter schemas in the output
        schema_allowed = self._can_yield_schema(manifest_entry['schema'])

        if self._extract_tables and schema_allowed:
            yield table

        if self._extract_tags and badge_names and schema_allowed:
            yield BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                start_key=table._get_table_key(),
                                badges=[Badge(badge_name, 'table') for badge_name in badge_names])

        if self._source_url and schema_allowed:
            yield TableSource(db_name=table.database,
                              cluster=table.cluster,
                              schema=table.schema,
                              table_name=table.name,
                              source=os.path.join(self._source_url,
                                                  manifest_entry.get('original_file_path')))

    if not self._extract_lineage:
        return

    for upstream_id, child_ids in self._dbt_manifest['child_map'].items():
        if upstream_id not in table_key_by_dbt_id:
            continue

        # Keep only children that are dbt models with a recorded table key.
        downstream_keys = [
            table_key_by_dbt_id[child_id]
            for child_id in child_ids
            if child_id.startswith(DBT_MODEL_PREFIX) and table_key_by_dbt_id.get(child_id)
        ]
        if downstream_keys:
            yield TableLineage(
                table_key=table_key_by_dbt_id[upstream_id],
                downstream_deps=downstream_keys
            )
class TestTableLineage(unittest.TestCase):
    """Tests for ``TableLineage`` with the neo4j and Neptune serializers."""

    def setUp(self) -> None:
        """Create the lineage fixture and the table keys used in expectations."""
        super(TestTableLineage, self).setUp()
        downstream_keys = [
            'hive://default.test_schema/test_table1',
            'hive://default.test_schema/test_table2',
        ]
        self.table_lineage = TableLineage(db_name='hive',
                                          schema=SCHEMA,
                                          table_name=TABLE,
                                          cluster=CLUSTER,
                                          downstream_deps=downstream_keys)

        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        self.end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'
        self.end_key2 = f'{DB}://{CLUSTER}.test_schema/test_table2'

    def _expected_neo4j_relation(self, end_key: str) -> dict:
        """Return the expected serialized relation from the start table to ``end_key``."""
        return {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE,
        }

    @staticmethod
    def _expected_neptune_edge(from_key: str, to_key: str, label: str) -> dict:
        """Return the expected Neptune row for one edge between two table keys."""
        from_vertex = 'Table:' + from_key
        to_vertex = 'Table:' + to_key
        # The same identifier is expected under both the id header and the
        # metadata-key property.
        edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=from_vertex,
            to_vertex_id=to_vertex,
            label=label,
        )
        return {
            NEPTUNE_HEADER_ID: edge_id,
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: from_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: to_vertex,
            NEPTUNE_HEADER_LABEL: label,
            # The extraction timestamp varies per run, so any value matches.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
        }

    def test_get_table_model_key(self) -> None:
        """The model key is built from db, cluster, schema and table."""
        model_key = self.table_lineage.get_table_model_key(db=DB,
                                                           cluster=CLUSTER,
                                                           schema=SCHEMA,
                                                           table=TABLE)
        self.assertEqual(model_key, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """Draining the node iterator of the fixture yields nothing."""
        serialized_nodes = []
        while True:
            node = self.table_lineage.create_next_node()
            if not node:
                break
            serialized_nodes.append(neo4_serializer.serialize_node(node))

        self.assertEqual(len(serialized_nodes), 0)

    def test_create_relation(self) -> None:
        """One neo4j relation per downstream table is produced, in order."""
        expected_relations = [
            self._expected_neo4j_relation(self.end_key1),
            self._expected_neo4j_relation(self.end_key2),
        ]

        serialized_relations = []
        while True:
            relation = self.table_lineage.create_next_relation()
            if not relation:
                break
            serialized_relations.append(
                neo4_serializer.serialize_relationship(relation))

        self.assertEqual(serialized_relations, expected_relations)

    def test_create_relation_neptune(self) -> None:
        """Each relation converts to a forward and a reverse Neptune edge."""
        forward_label = TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE
        reverse_label = TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
        expected = [
            [
                self._expected_neptune_edge(self.start_key, end_key, forward_label),
                self._expected_neptune_edge(end_key, self.start_key, reverse_label),
            ]
            for end_key in (self.end_key1, self.end_key2)
        ]

        converted_relations = []
        while True:
            relation = self.table_lineage.create_next_relation()
            if not relation:
                break
            converted_relations.append(
                neptune_serializer.convert_relationship(relation))

        self.assertEqual(converted_relations, expected)