Exemplo n.º 1
0
 def setUp(self) -> None:
     """Build the TableLineage fixture shared by the tests in this case."""
     super(TestTableLineage, self).setUp()
     # Two downstream tables, so the fixture carries more than one dependency.
     downstream_tables = [
         'hive://default.test_schema/test_table1',
         'hive://default.test_schema/test_table2',
     ]
     self.table_lineage = TableLineage(
         db_name='hive',
         schema=SCHEMA,
         table_name=TABLE,
         cluster=CLUSTER,
         downstream_deps=downstream_tables,
     )
    def setUp(self) -> None:
        """Create the lineage fixture and the table keys the assertions compare against."""
        super(TestTableLineage, self).setUp()

        # Keys expected on the start/end side of each generated relation.
        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        self.end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'
        self.end_key2 = f'{DB}://{CLUSTER}.test_schema/test_table2'

        downstream_tables = [
            'hive://default.test_schema/test_table1',
            'hive://default.test_schema/test_table2',
        ]
        self.table_lineage = TableLineage(
            table_key=f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}',
            downstream_deps=downstream_tables,
        )
Exemplo n.º 3
0
class TestTableLineage(unittest.TestCase):
    """Tests for TableLineage key building, node creation and relation creation."""

    def setUp(self):
        # type: () -> None
        # Fixture: one source table with two downstream dependencies.
        super(TestTableLineage, self).setUp()
        self.table_lineage = TableLineage(
            db_name='hive',
            schema_name=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            downstream_deps=[
                'hive://default.test_schema/test_table1',
                'hive://default.test_schema/test_table2'
            ])

    def test_get_table_model_key(self):
        # type: () -> None
        # NOTE(review): the literal below presumably matches the module-level
        # DB/CLUSTER/SCHEMA/TABLE constants -- confirm against their definitions.
        metadata = self.table_lineage.get_table_model_key(db=DB,
                                                          cluster=CLUSTER,
                                                          schema=SCHEMA,
                                                          table=TABLE)
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self):
        # type: () -> None
        # The fixture is expected to produce no nodes at all.
        nodes = self.table_lineage.create_nodes()
        self.assertEqual(len(nodes), 0)

    def test_create_relation(self):
        # type: () -> None
        # One relation per downstream dependency declared in setUp.
        relations = self.table_lineage.create_relation()
        self.assertEqual(len(relations), 2)

        start_key = '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
                                                             schema=SCHEMA,
                                                             tbl=TABLE,
                                                             cluster=CLUSTER)
        end_key1 = '{db}://{cluster}.{schema}/{tbl}'.format(
            db=DB, schema='test_schema', tbl='test_table1', cluster=CLUSTER)

        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key1,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
        }
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        # The former assertTrue(len(relations), 2) was dropped: it treated 2 as
        # the failure message and so never checked the length; the real length
        # check is the assertEqual above.
        self.assertIn(expected_relation, relations)
Exemplo n.º 4
0
class TestTableLineage(unittest.TestCase):
    """Tests for TableLineage key building, node creation and serialized relations."""

    def setUp(self) -> None:
        # Fixture: one source table with two downstream dependencies.
        super(TestTableLineage, self).setUp()
        self.table_lineage = TableLineage(
            db_name='hive',
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            downstream_deps=[
                'hive://default.test_schema/test_table1',
                'hive://default.test_schema/test_table2'
            ])

    def test_get_table_model_key(self) -> None:
        # NOTE(review): the literal below presumably matches the module-level
        # DB/CLUSTER/SCHEMA/TABLE constants -- confirm against their definitions.
        metadata = self.table_lineage.get_table_model_key(db=DB,
                                                          cluster=CLUSTER,
                                                          schema=SCHEMA,
                                                          table=TABLE)
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        # The fixture is expected to produce no nodes at all.
        nodes = self.table_lineage.create_nodes()
        self.assertEqual(len(nodes), 0)

    def test_create_relation(self) -> None:
        # One relation per downstream dependency declared in setUp.
        relations = self.table_lineage.create_relation()
        self.assertEqual(len(relations), 2)

        start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'

        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key1,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE
        }
        # A distinct loop name keeps the expected dict above from being
        # confused with the items being serialized.
        actual_relations = [
            neo4_serializer.serialize_relationship(raw_relation)
            for raw_relation in relations
        ]
        # Previously assertTrue(len(relations), 2), which treated 2 as the
        # failure message and never compared the length.
        self.assertEqual(len(actual_relations), 2)
        self.assertIn(expected_relation, actual_relations)
Exemplo n.º 5
0
    def _load_openlineage_event(self) -> Any:
        """
        Set up a lazy iterator of TableLineage records from the lineage event file.

        Each line of the file is parsed as JSON, the parsed events are handed to
        ``self._extract_dataset_info`` and every item it produces becomes a
        ``TableLineage`` whose ``table_key`` is the item's ``'input'`` and whose
        single downstream dependency is its ``'output'``. The result is stored on
        ``self._iter``; nothing is returned explicitly.
        """
        # The handle is kept on the instance and left open on purpose: the
        # generators below read from it lazily, after this method has returned.
        # NOTE(review): closing is presumably handled elsewhere -- confirm.
        handle = open(self.table_lineage_file_location, 'r')
        self.input_file = handle

        parsed_events = (json.loads(raw_line) for raw_line in handle)
        dataset_pairs = self._extract_dataset_info(parsed_events)

        self._iter = (
            TableLineage(table_key=pair['input'], downstream_deps=[pair['output']])
            for pair in dataset_pairs
        )
Exemplo n.º 6
0
    def _load_csv(self) -> None:
        """
        Read the lineage CSV file and expose its rows as TableLineage records.

        All rows are kept as plain dicts on ``self.table_lineage``. Each row's
        ``source_table_key`` and ``target_table_key`` columns become one
        ``TableLineage`` with a single downstream dependency, and an iterator
        over those records is stored on ``self._iter``.
        """
        with open(self.table_lineage_file_location, 'r') as csv_file:
            self.table_lineage = list(map(dict, csv.DictReader(csv_file)))

        lineage_records = [
            TableLineage(table_key=row['source_table_key'],
                         downstream_deps=[row['target_table_key']])
            for row in self.table_lineage
        ]
        self._iter = iter(lineage_records)
Exemplo n.º 7
0
    def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
        """
        Yield every model record derived from the dbt manifest and catalog.

        For each manifest node that is a dbt model and is also present in the catalog, a
        ``TableMetadata`` is built. Depending on the extractor's flags and the schema
        filter, the table itself, its badges and its source link are yielded. Once all
        nodes are processed, ``TableLineage`` records are yielded from the manifest's
        ``child_map`` when lineage extraction is enabled.
        """
        table_keys_by_dbt_id = {}
        for node_id, node in self._dbt_manifest['nodes'].items():

            is_catalogued_model = (
                node['resource_type'] == DBT_MODEL_TYPE
                and node_id in self._dbt_catalog['nodes']
            )
            if not is_catalogued_model:
                continue

            LOGGER.info(
                'Extracting dbt {}.{}'.format(node['schema'], node[self._model_name_key])
            )

            catalog_entry = self._dbt_catalog['nodes'][node_id]

            columns: List[ColumnMetadata] = self._get_column_values(
                manifest_columns=node['columns'], catalog_columns=catalog_entry['columns']
            )
            description, description_source = self._get_table_descriptions(node)
            tags, badge_names = self._get_table_tags_badges(node)

            table = TableMetadata(
                database=self._default_sanitize(self._database_name),
                # dbt's "database" plays the role of the cluster here
                cluster=self._default_sanitize(node['database']),
                schema=self._default_sanitize(node['schema']),
                name=self._default_sanitize(node[self._model_name_key]),
                is_view=catalog_entry['metadata']['type'] == 'view',
                columns=columns,
                tags=tags,
                description=description,
                description_source=description_source
            )
            # Recorded for every catalogued model, even ones filtered out below,
            # so lineage can still reference them.
            table_keys_by_dbt_id[node_id] = table._get_table_key()

            # The optional schema filter gates everything yielded for this node.
            if not self._can_yield_schema(node['schema']):
                continue

            if self._extract_tables:
                yield table

            if self._extract_tags and badge_names:
                yield BadgeMetadata(start_label=TableMetadata.TABLE_NODE_LABEL,
                                    start_key=table._get_table_key(),
                                    badges=[Badge(badge_name, 'table') for badge_name in badge_names])

            if self._source_url:
                # NOTE(review): .get() yields None when 'original_file_path' is absent,
                # which os.path.join would reject -- confirm the key is always present.
                source_path = os.path.join(self._source_url, node.get('original_file_path'))
                yield TableSource(db_name=table.database,
                                  cluster=table.cluster,
                                  schema=table.schema,
                                  table_name=table.name,
                                  source=source_path)

        if not self._extract_lineage:
            return

        for upstream_id, child_ids in self._dbt_manifest['child_map'].items():
            if upstream_id not in table_keys_by_dbt_id:
                continue
            # Keep only children that are models with a recorded table key.
            downstream_keys = [
                table_keys_by_dbt_id[child_id] for child_id in child_ids
                if child_id.startswith(DBT_MODEL_PREFIX) and table_keys_by_dbt_id.get(child_id)
            ]
            if downstream_keys:
                yield TableLineage(
                    table_key=table_keys_by_dbt_id[upstream_id],
                    downstream_deps=downstream_keys
                )
class TestTableLineage(unittest.TestCase):
    """Tests for TableLineage node/relation generation and their serialized forms."""

    def setUp(self) -> None:
        super().setUp()
        downstream_tables = [
            'hive://default.test_schema/test_table1',
            'hive://default.test_schema/test_table2',
        ]
        self.table_lineage = TableLineage(db_name='hive',
                                          schema=SCHEMA,
                                          table_name=TABLE,
                                          cluster=CLUSTER,
                                          downstream_deps=downstream_tables)

        # Keys expected on the start/end side of each generated relation.
        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
        self.end_key1 = f'{DB}://{CLUSTER}.test_schema/test_table1'
        self.end_key2 = f'{DB}://{CLUSTER}.test_schema/test_table2'

    @staticmethod
    def _drain(fetch_next, serialize) -> list:
        """Call ``fetch_next`` until it returns a falsy value, serializing each item in order."""
        collected = []
        item = fetch_next()
        while item:
            collected.append(serialize(item))
            item = fetch_next()
        return collected

    def _expected_relation(self, end_key) -> dict:
        """Expected serialized relation from the source table to ``end_key``."""
        return {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: 'Table',
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE,
        }

    @staticmethod
    def _expected_neptune_edge(label, from_key, to_key) -> dict:
        """Expected Neptune edge row with the given label between two table keys."""
        from_vertex = 'Table:' + from_key
        to_vertex = 'Table:' + to_key
        edge_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=from_vertex,
            to_vertex_id=to_vertex,
            label=label,
        )
        return {
            NEPTUNE_HEADER_ID: edge_id,
            METADATA_KEY_PROPERTY_NAME_BULK_LOADER_FORMAT: edge_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: from_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: to_vertex,
            NEPTUNE_HEADER_LABEL: label,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB,
        }

    def test_get_table_model_key(self) -> None:
        model_key = self.table_lineage.get_table_model_key(
            db=DB, cluster=CLUSTER, schema=SCHEMA, table=TABLE)
        self.assertEqual(model_key, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        # The fixture is expected to produce no nodes at all.
        actual = self._drain(self.table_lineage.create_next_node,
                             neo4_serializer.serialize_node)

        self.assertEqual(len(actual), 0)

    def test_create_relation(self) -> None:
        # One relation per downstream table, in declaration order.
        expected_relations = [
            self._expected_relation(self.end_key1),
            self._expected_relation(self.end_key2),
        ]

        actual = self._drain(self.table_lineage.create_next_relation,
                             neo4_serializer.serialize_relationship)

        self.assertEqual(actual, expected_relations)

    def test_create_relation_neptune(self) -> None:
        forward_label = TableLineage.ORIGIN_DEPENDENCY_RELATION_TYPE
        reverse_label = TableLineage.DEPENDENCY_ORIGIN_RELATION_TYPE

        # Each relation is expected as a [forward edge, reverse edge] pair.
        expected = [
            [
                self._expected_neptune_edge(forward_label, self.start_key, end_key),
                self._expected_neptune_edge(reverse_label, end_key, self.start_key),
            ]
            for end_key in (self.end_key1, self.end_key2)
        ]

        actual = self._drain(self.table_lineage.create_next_relation,
                             neptune_serializer.convert_relationship)

        self.assertEqual(actual, expected)