示例#1
0
 def setUp(self) -> None:
     """Build the TableSource instance that the tests operate on."""
     super(TestTableSource, self).setUp()
     fixture = TableSource(
         db_name='hive',
         schema=SCHEMA,
         table_name=TABLE,
         cluster=CLUSTER,
         source=SOURCE,
     )
     self.table_source = fixture
示例#2
0
    def setUp(self) -> None:
        """Build the TableSource fixture and the two key strings stored next to it."""
        super(TestTableSource, self).setUp()
        fixture = TableSource(
            db_name='hive',
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            source=SOURCE,
        )
        self.table_source = fixture

        # start_key carries the '/_source' suffix; end_key is the same string without it.
        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'
示例#3
0
class TestTableSource(unittest.TestCase):
    """Tests for TableSource: model keys, node creation and relation creation."""

    def setUp(self):
        # type: () -> None
        """Instantiate the TableSource shared by every test."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema_name=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

    def _expected_table_key(self):
        # type: () -> str
        """Return the '{db}://{cluster}.{schema}/{tbl}' key built from the test constants."""
        return '{db}://{cluster}.{schema}/{tbl}'.format(db=DB,
                                                        schema=SCHEMA,
                                                        tbl=TABLE,
                                                        cluster=CLUSTER)

    def _expected_source_key(self):
        # type: () -> str
        """Return the '{db}://{cluster}.{schema}/{tbl}/_source' key built from the test constants."""
        return '{db}://{cluster}.{schema}/{tbl}/_source'.format(db=DB,
                                                                schema=SCHEMA,
                                                                tbl=TABLE,
                                                                cluster=CLUSTER)

    def test_get_source_model_key(self):
        # type: () -> None
        """get_source_model_key() returns the '/_source'-suffixed key."""
        source = self.table_source.get_source_model_key()
        self.assertEqual(source, self._expected_source_key())

    def test_get_metadata_model_key(self):
        # type: () -> None
        """get_metadata_model_key() returns the literal key 'hive://default.base/test'."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self):
        # type: () -> None
        """create_nodes() produces exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self):
        # type: () -> None
        """create_relation() produces exactly one relation, linking the source key to the table key."""
        relations = self.table_source.create_relation()
        self.assertEqual(len(relations), 1)

        relation = {
            RELATION_START_KEY: self._expected_source_key(),
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: self._expected_table_key(),
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }

        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(relation, relations)
示例#4
0
class TestTableSource(unittest.TestCase):
    """Tests for TableSource: model keys, node creation and the serialized relation."""

    def setUp(self) -> None:
        """Instantiate the TableSource shared by every test."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

    def test_get_source_model_key(self) -> None:
        """get_source_model_key() returns the '/_source'-suffixed key."""
        source = self.table_source.get_source_model_key()
        self.assertEqual(source, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source')

    def test_get_metadata_model_key(self) -> None:
        """get_metadata_model_key() returns the literal key 'hive://default.base/test'."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """create_nodes() produces exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self) -> None:
        """create_relation() produces one relation whose serialization matches the expected dict."""
        relations = self.table_source.create_relation()
        self.assertEqual(len(relations), 1)
        serialized_relation = neo4_serializer.serialize_relationship(relations[0])

        start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'

        expected_relation = {
            RELATION_START_KEY: start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }

        self.assertDictEqual(expected_relation, serialized_relation)
示例#5
0
class TestTableSource(unittest.TestCase):
    """Tests for TableSource: model keys, node creation, and the relation as
    serialized by neo4_serializer and neptune_serializer."""

    def setUp(self) -> None:
        """Instantiate the TableSource and the start/end keys expected for its relation."""
        super(TestTableSource, self).setUp()
        self.table_source = TableSource(db_name='hive',
                                        schema=SCHEMA,
                                        table_name=TABLE,
                                        cluster=CLUSTER,
                                        source=SOURCE)

        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'

    def test_get_source_model_key(self) -> None:
        """get_source_model_key() returns the '/_source'-suffixed key."""
        source = self.table_source.get_source_model_key()
        self.assertEqual(source, f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source')

    def test_get_metadata_model_key(self) -> None:
        """get_metadata_model_key() returns the literal key 'hive://default.base/test'."""
        metadata = self.table_source.get_metadata_model_key()
        self.assertEqual(metadata, 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """create_nodes() produces exactly one node."""
        nodes = self.table_source.create_nodes()
        self.assertEqual(len(nodes), 1)

    def test_create_relation(self) -> None:
        """create_relation() produces one relation whose neo4j serialization matches the expected dict."""
        relations = self.table_source.create_relation()
        self.assertEqual(len(relations), 1)
        serialized_relation = neo4_serializer.serialize_relationship(
            relations[0])

        expected_relation = {
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }

        self.assertDictEqual(expected_relation, serialized_relation)

    def test_create_relation_neptune(self) -> None:
        """neptune_serializer.convert_relationship() yields a forward and a reverse edge dict."""
        relations = self.table_source.create_relation()
        serialized_relations = neptune_serializer.convert_relationship(
            relations[0])

        # First entry: start_key -> end_key; second entry: the same pair reversed.
        expected = [{
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.start_key,
                to_vertex_id=self.end_key,
                label=TableSource.SOURCE_TABLE_RELATION_TYPE),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            self.start_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            self.end_key,
            NEPTUNE_HEADER_LABEL:
            TableSource.SOURCE_TABLE_RELATION_TYPE,
            # ANY compares equal to whatever value the serializer emits here.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }, {
            NEPTUNE_HEADER_ID:
            "{from_vertex_id}_{to_vertex_id}_{label}".format(
                from_vertex_id=self.end_key,
                to_vertex_id=self.start_key,
                label=TableSource.TABLE_SOURCE_RELATION_TYPE),
            NEPTUNE_RELATIONSHIP_HEADER_FROM:
            self.end_key,
            NEPTUNE_RELATIONSHIP_HEADER_TO:
            self.start_key,
            NEPTUNE_HEADER_LABEL:
            TableSource.TABLE_SOURCE_RELATION_TYPE,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT:
            NEPTUNE_CREATION_TYPE_JOB
        }]

        self.assertListEqual(expected, serialized_relations)
示例#6
0
    def _get_extract_iter(self) -> Iterator[Union[TableMetadata, BadgeMetadata, TableSource, TableLineage]]:
        """
        Yield every model object derived from the dbt manifest and catalog.

        For each manifest node whose resource type is DBT_MODEL_TYPE and which is
        also present in the catalog, a TableMetadata is built. Depending on the
        extractor flags and the schema filter, the table itself, its badges and
        its source link are yielded. When lineage extraction is enabled, one
        TableLineage is then yielded per recorded upstream table that has at
        least one recorded downstream model.
        """
        # dbt node id -> table key. Recorded for every model, including those whose
        # schema is filtered out of the output, so the lineage pass can resolve them.
        table_key_by_dbt_id = {}

        for dbt_id, manifest_node in self._dbt_manifest['nodes'].items():
            is_model = manifest_node['resource_type'] == DBT_MODEL_TYPE
            # Skip anything that is not a model or has no matching catalog entry.
            if not (is_model and dbt_id in self._dbt_catalog['nodes']):
                continue

            schema_name = manifest_node['schema']
            LOGGER.info(
                'Extracting dbt {}.{}'.format(schema_name, manifest_node[self._model_name_key])
            )

            catalog_node = self._dbt_catalog['nodes'][dbt_id]

            columns: List[ColumnMetadata] = self._get_column_values(
                manifest_columns=manifest_node['columns'], catalog_columns=catalog_node['columns']
            )
            description, description_source = self._get_table_descriptions(manifest_node)
            tags, badge_names = self._get_table_tags_badges(manifest_node)

            table = TableMetadata(
                database=self._default_sanitize(self._database_name),
                # The dbt "database" is the cluster here
                cluster=self._default_sanitize(manifest_node['database']),
                schema=self._default_sanitize(schema_name),
                name=self._default_sanitize(manifest_node[self._model_name_key]),
                is_view=catalog_node['metadata']['type'] == 'view',
                columns=columns,
                tags=tags,
                description=description,
                description_source=description_source
            )
            # Keep track for Lineage
            table_key_by_dbt_id[dbt_id] = table._get_table_key()

            # Optionally filter schemas in the output
            if not self._can_yield_schema(schema_name):
                continue

            if self._extract_tables:
                yield table

            if self._extract_tags and badge_names:
                yield BadgeMetadata(
                    start_label=TableMetadata.TABLE_NODE_LABEL,
                    start_key=table._get_table_key(),
                    badges=[Badge(badge_name, 'table') for badge_name in badge_names],
                )

            if self._source_url:
                yield TableSource(
                    db_name=table.database,
                    cluster=table.cluster,
                    schema=table.schema,
                    table_name=table.name,
                    source=os.path.join(self._source_url, manifest_node.get('original_file_path')),
                )

        if not self._extract_lineage:
            return

        for upstream_id, child_ids in self._dbt_manifest['child_map'].items():
            if upstream_id not in table_key_by_dbt_id:
                continue
            # Keep only children that are models and whose table key was recorded above.
            downstream_keys = [
                table_key_by_dbt_id[child_id] for child_id in child_ids
                if child_id.startswith(DBT_MODEL_PREFIX) and table_key_by_dbt_id.get(child_id)
            ]
            if downstream_keys:
                yield TableLineage(
                    table_key=table_key_by_dbt_id[upstream_id],
                    downstream_deps=downstream_keys
                )
class TestTableSource(unittest.TestCase):
    """Tests for TableSource: model keys plus its nodes, relations and records
    as serialized by neo4_serializer, neptune_serializer and mysql_serializer."""

    def setUp(self) -> None:
        """Instantiate the TableSource and the start/end keys expected for its relation."""
        super().setUp()
        self.table_source = TableSource(
            db_name='hive',
            schema=SCHEMA,
            table_name=TABLE,
            cluster=CLUSTER,
            source=SOURCE,
        )

        self.start_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        self.end_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}'

    def test_get_source_model_key(self) -> None:
        """get_source_model_key() returns the '/_source'-suffixed key."""
        expected_key = f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source'
        self.assertEqual(self.table_source.get_source_model_key(), expected_key)

    def test_get_metadata_model_key(self) -> None:
        """get_metadata_model_key() returns the literal key 'hive://default.base/test'."""
        self.assertEqual(self.table_source.get_metadata_model_key(), 'hive://default.base/test')

    def test_create_nodes(self) -> None:
        """Draining create_next_node() yields a single serialized 'Source' node."""
        expected_nodes = [{
            'LABEL': 'Source',
            'KEY': f'{DB}://{CLUSTER}.{SCHEMA}/{TABLE}/_source',
            'source': SOURCE,
            'source_type': 'github'
        }]

        # Pull nodes until the model returns a falsy value.
        actual = []
        while True:
            node = self.table_source.create_next_node()
            if not node:
                break
            actual.append(neo4_serializer.serialize_node(node))

        self.assertEqual(expected_nodes, actual)

    def test_create_relation(self) -> None:
        """Draining create_next_relation() yields a single serialized source-to-table relation."""
        expected_relations = [{
            RELATION_START_KEY: self.start_key,
            RELATION_START_LABEL: TableSource.LABEL,
            RELATION_END_KEY: self.end_key,
            RELATION_END_LABEL: 'Table',
            RELATION_TYPE: TableSource.SOURCE_TABLE_RELATION_TYPE,
            RELATION_REVERSE_TYPE: TableSource.TABLE_SOURCE_RELATION_TYPE
        }]

        actual = []
        while True:
            relation = self.table_source.create_next_relation()
            if not relation:
                break
            actual.append(neo4_serializer.serialize_relationship(relation))

        self.assertEqual(expected_relations, actual)

    def test_create_relation_neptune(self) -> None:
        """Each relation converts to a [forward, reverse] pair of Neptune edge dicts."""
        actual = []
        while True:
            relation = self.table_source.create_next_relation()
            if not relation:
                break
            actual.append(neptune_serializer.convert_relationship(relation))

        # Vertex ids are the model keys prefixed with their node label.
        source_vertex = "Source:" + self.start_key
        table_vertex = "Table:" + self.end_key

        forward_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=source_vertex,
            to_vertex_id=table_vertex,
            label=TableSource.SOURCE_TABLE_RELATION_TYPE
        )
        reverse_id = "{label}:{from_vertex_id}_{to_vertex_id}".format(
            from_vertex_id=table_vertex,
            to_vertex_id=source_vertex,
            label=TableSource.TABLE_SOURCE_RELATION_TYPE
        )

        forward_edge = {
            NEPTUNE_HEADER_ID: forward_id,
            METADATA_KEY_PROPERTY_NAME: forward_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: source_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: table_vertex,
            NEPTUNE_HEADER_LABEL: TableSource.SOURCE_TABLE_RELATION_TYPE,
            # ANY compares equal to whatever value the serializer emits here.
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        reverse_edge = {
            NEPTUNE_HEADER_ID: reverse_id,
            METADATA_KEY_PROPERTY_NAME: reverse_id,
            NEPTUNE_RELATIONSHIP_HEADER_FROM: table_vertex,
            NEPTUNE_RELATIONSHIP_HEADER_TO: source_vertex,
            NEPTUNE_HEADER_LABEL: TableSource.TABLE_SOURCE_RELATION_TYPE,
            NEPTUNE_LAST_EXTRACTED_AT_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: ANY,
            NEPTUNE_CREATION_TYPE_RELATIONSHIP_PROPERTY_NAME_BULK_LOADER_FORMAT: NEPTUNE_CREATION_TYPE_JOB
        }
        expected = [[forward_edge, reverse_edge]]

        self.assertListEqual(expected, actual)

    def test_create_records(self) -> None:
        """Draining create_next_record() yields a single serialized record for the source."""
        expected = [{
            'rk': self.table_source.get_source_model_key(),
            'source': self.table_source.source,
            'source_type': self.table_source.source_type,
            'table_rk': self.table_source.get_metadata_model_key()
        }]

        actual = []
        while True:
            record = self.table_source.create_next_record()
            if not record:
                break
            actual.append(mysql_serializer.serialize_record(record))

        self.assertEqual(expected, actual)