예제 #1
0
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        for row in self._get_raw_extract_iter():
            columns, i = [], 0

            for column in row["StorageDescriptor"]["Columns"] + row.get(
                    "PartitionKeys", []):
                columns.append(
                    ColumnMetadata(
                        column["Name"],
                        column["Comment"] if "Comment" in column else None,
                        column["Type"],
                        i,
                    ))
                i += 1

            catalog, schema, table = self._parse_location(
                location=row["StorageDescriptor"]["Location"],
                name=row["Name"])

            if self._connection_name:
                database = self._connection_name + "/" + row["DatabaseName"]
            else:
                database = row["DatabaseName"]

            yield TableMetadata(
                database,
                catalog,
                schema,
                table,
                row.get("Description")
                or row.get("Parameters", {}).get("comment"),
                columns,
                row.get("TableType") == "VIRTUAL_VIEW",
            )
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and
        yields TableMetadata
        :return:
        """

        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        name=row["column_name"],
                        description=None,
                        col_type=row["column_type"],
                        sort_order=row["column_sort_order"],
                    ))

            yield TableMetadata(
                database=self._database,
                cluster=None,
                schema=last_row["schema_name"],
                name=last_row["table_name"],
                description=None,
                columns=columns,
                is_view=last_row["table_type"] == "V",
            )
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and
        yields TableMetadata
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                column_description = \
                    unidecode(row['col_description']) \
                    if row['col_description'] else None
                last_row = row
                columns.append(
                    ColumnMetadata(name=row['col_name'],
                                   description=column_description,
                                   col_type=row['col_type'],
                                   sort_order=row['col_sort_order']))

            description = \
                unidecode(last_row['description']) \
                if last_row['description'] else None

            yield TableMetadata(database=self._database,
                                cluster=last_row['cluster'],
                                schema=last_row['schema'],
                                name=last_row['name'],
                                description=description,
                                columns=columns,
                                is_view=last_row['is_view'] == 'true')
예제 #4
0
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """
        for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
            columns = []

            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["data_type"],
                        row["col_sort_order"],
                    )
                )

            yield TableMetadata(
                self._database,
                last_row["cluster"],
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
            )
    def _get_extract_iter(self) -> Iterator[TableMetadata]:
        """
        Using itertools.groupby and raw level iterator, it groups to table and
        yields TableMetadata
        :return:
        """
        for _, group in groupby(self._get_raw_extract_iter(),
                                self._get_table_key):
            columns = []
            for row in group:
                column_description = (unidecode(row["col_description"])
                                      if row["col_description"] else None)
                last_row = row
                columns.append(
                    ColumnMetadata(
                        name=row["col_name"],
                        description=column_description,
                        data_type=row["data_type"],
                        sort_order=row["col_sort_order"],
                    ))

            description = (unidecode(last_row["description"])
                           if last_row["description"] else None)

            yield TableMetadata(
                database=self._database,
                cluster=last_row["cluster"],
                schema=last_row["schema"],
                name=last_row["name"],
                description=description,
                columns=columns,
                is_view=last_row["is_view"] == "true",
            )
예제 #6
0
    def setUp(self) -> None:
        logging.basicConfig(level=logging.INFO)
        self.database = "mock_database"
        self.schema = "mock_schema"
        self.table = "mock_table"

        tmp_path = tempfile.NamedTemporaryFile()
        self.table_stub_path = os.path.join(tmp_path.name, self.database,
                                            f"{self.schema}.{self.table}.md")

        self.loader_config = ConfigFactory.from_dict({
            "table_stub_paths": [self.table_stub_path],
            "base_directory":
            tmp_path.name,
        })

        self.extractor_config = ConfigFactory.from_dict({
            MetricRunner.DATABASE_KEY:
            self.database,
            "table_stub_paths": [self.table_stub_path],
        })

        self.record = TableMetadata(
            database=self.database,
            cluster=None,
            schema=self.schema,
            name=self.table,
        )
    def test_transformed_record_contains_components(self):
        """"""
        column = ColumnMetadata(
            name=COLUMN,
            col_type="Integer",
            sort_order=0,
            description=COLUMN_DESCRIPTION,
        )
        record = TableMetadata(
            database=DATABASE,
            cluster=CLUSTER,
            schema=SCHEMA,
            name=TABLE,
            columns=[column],
        )
        components = [
            DATABASE,
            CLUSTER,
            SCHEMA,
            TABLE,
            COLUMN,
            COLUMN_DESCRIPTION,
        ]
        transformer = MarkdownTransformer()
        transformer.init(self._conf)
        transformed_record = transformer.transform(record)
        markdown_blob = transformed_record.markdown_blob
        transformer.close()

        has_components = all(x in markdown_blob for x in components)

        self.assertEqual(has_components, True)
예제 #8
0
    def _retrieve_tables(self, dataset) -> Any:
        for page in self._page_table_list_results(dataset):
            if "tables" not in page:
                continue

            for table in page["tables"]:
                tableRef = table["tableReference"]
                table_id = tableRef["tableId"]

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the
                # UI. (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.)

                if self._is_sharded_table(table_id):
                    # If the last eight characters are digits, we assume the
                    # table is of a table date range type and then we only need
                    # one schema definition
                    table_prefix = table_id[:-BigQueryMetadataExtractor.
                                            DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # If one table in the date range is processed, then
                        # ignore other ones (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = (self.bigquery_service.tables().get(
                    projectId=tableRef["projectId"],
                    datasetId=tableRef["datasetId"],
                    tableId=tableRef["tableId"],
                ).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES))

                cols = []
                if self._is_table_match_regex(tableRef):
                    # BigQuery tables also have interesting metadata about
                    # partitioning data location (EU/US), mod/create time, etc...
                    # Extract that some other time?
                    # Not all tables have schemas
                    if "schema" in table:
                        schema = table["schema"]
                        if "fields" in schema:
                            total_cols = 0
                            for column in schema["fields"]:
                                total_cols = self._iterate_over_cols(
                                    "", column, cols, total_cols + 1)

                table_meta = TableMetadata(
                    database="bigquery",
                    cluster=tableRef["projectId"],
                    schema=tableRef["datasetId"],
                    name=table_id,
                    description=table.get("description", ""),
                    columns=cols,
                    is_view=table["type"] == "VIEW",
                )

                yield (table_meta)
예제 #9
0
    def get_table_metadata(
        self,
        schema: str,
        table: str,
        cluster: Optional[str] = None,
        is_view_query_enabled: Optional[bool] = False,
    ):
        # Format table and schema addresses for queries.
        full_schema_address = self._get_full_schema_address(cluster, schema)
        full_table_address = "{}.{}".format(full_schema_address, table)

        # Execute query that gets column type + partition information.
        columns_query = "show columns in {}".format(full_table_address)
        column_query_results = self.execute(columns_query, has_header=True)
        column_query_field_names = next(column_query_results)
        columns = []
        for i, column_query_result in enumerate(column_query_results):
            column_dict = dict(zip(column_query_field_names, column_query_result))
            columns.append(
                ColumnMetadata(
                    name=column_dict["Column"],
                    description=column_dict["Comment"],
                    data_type=column_dict["Type"],
                    sort_order=i,
                    is_partition_column=column_dict["Extra"] == "partition key",
                )
            )

        if is_view_query_enabled:
            # Execute query that returns if table is a view.
            view_query = """
                select table_type
                from information_schema.tables
                where table_schema='{table_schema}'
                  and table_name='{table_name}'
                """.format(
                table_schema=schema, table_name=table
            )
            view_query_results = self.execute(view_query, has_header=False)
            is_view = next(view_query_results)[0] == "VIEW"
        else:
            is_view = False

        return TableMetadata(
            database=self._database,
            cluster=cluster,
            schema=schema,
            name=table,
            description=None,
            columns=columns,
            is_view=is_view,
        )
    def _get_extract_iter(self):
        with self.driver.session() as session:
            if not hasattr(self, "results"):
                self.results = session.read_transaction(self._execute_query)

            for result in self.results:
                # Parse watermark information.
                partition_columns = []
                for watermark in result["watermarks"]:
                    partition_columns.append(watermark["partition_key"])

                # Parse column information.
                column_names = result["column_names"]
                column_descriptions = result["column_descriptions"]
                column_types = result["column_types"]
                column_sort_orders = result["column_sort_orders"]
                zipped_columns = zip_longest(column_names, column_descriptions,
                                             column_types, column_sort_orders)

                column_metadatas = []
                for (
                        column_name,
                        column_description,
                        column_type,
                        column_sort_order,
                ) in zipped_columns:
                    if column_name in partition_columns:
                        is_partition_column = True
                    else:
                        is_partition_column = False
                    column_metadatas.append(
                        ColumnMetadata(
                            name=column_name,
                            description=column_description,
                            col_type=column_type,
                            sort_order=column_sort_order,
                            is_partition_column=is_partition_column,
                        ))

                yield TableMetadata(
                    database=result["database"],
                    cluster=result["cluster"],
                    schema=result["schema"],
                    name=result["name"],
                    description=result["description"],
                    columns=column_metadatas,
                    is_view=result["is_view"],
                    tags=result["tags"],
                )
예제 #11
0
    def test_extraction_with_single_result(self):
        with patch.object(
            splice_machine_metadata_extractor, "splice_connect"
        ) as mock_connect:
            column = ColumnMetadata("column1", None, "int", 0)
            table = TableMetadata(
                self.DATABASE,
                self.CLUSTER,
                "test_schema",
                "test_table",
                None,
                [column],
            )

            # Connection returns a cursor
            mock_cursor = MagicMock()
            mock_execute = MagicMock()
            mock_fetchall = MagicMock()

            # self.connection = splice_connect(...)
            mock_connection = MagicMock()
            mock_connect.return_value = mock_connection
            # self.cursor = self.connection.cursor()
            mock_connection.cursor.return_value = mock_cursor

            # self.cursor.execute(...)
            mock_cursor.execute = mock_execute

            # for row in self.cursor.fetchall()
            mock_cursor.fetchall = mock_fetchall

            mock_fetchall.return_value = [
                [
                    table.schema,
                    table.name,
                    "not-a-view",
                    column.name,
                    column.sort_order,
                    column.type,
                ]
            ]

            extractor = self.Extractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = table

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
예제 #12
0
 def test_get_all_table_metadata_from_information_schema(
         self, mock_settings) -> None:
     self.engine.init(self.conf)
     self.engine.execute = MagicMock(
         side_effect=presto_engine_execute_side_effect)
     mock_columns = [
         ColumnMetadata(
             name=MOCK_INFORMATION_SCHEMA_RESULT_1["col_name"],
             description=MOCK_INFORMATION_SCHEMA_RESULT_1[
                 "col_description"],  # noqa: 501
             data_type=MOCK_INFORMATION_SCHEMA_RESULT_1["data_type"],
             sort_order=MOCK_INFORMATION_SCHEMA_RESULT_1["col_sort_order"],
             is_partition_column=None,
         ),
         ColumnMetadata(
             name=MOCK_INFORMATION_SCHEMA_RESULT_2["col_name"],
             description=MOCK_INFORMATION_SCHEMA_RESULT_2[
                 "col_description"],  # noqa: 501
             data_type=MOCK_INFORMATION_SCHEMA_RESULT_2["data_type"],
             sort_order=MOCK_INFORMATION_SCHEMA_RESULT_2["col_sort_order"],
             is_partition_column=None,
         ),
     ]
     expected = TableMetadata(
         database=MOCK_DATABASE_NAME,
         cluster=MOCK_CLUSTER_NAME,
         schema=MOCK_SCHEMA_NAME,
         name=MOCK_TABLE_NAME,
         columns=mock_columns,
         is_view=bool(MOCK_INFORMATION_SCHEMA_RESULT_1["is_view"]),
     )
     results = self.engine.get_all_table_metadata_from_information_schema(
         cluster=MOCK_CLUSTER_NAME)
     result = next(results)
     self.maxDiff = None
     self.assertEqual(result.__repr__(), expected.__repr__())
    def _get_extract_iter(self):
        with self.driver.session() as session:
            if not hasattr(self, 'results'):
                self.results = session.read_transaction(self._execute_query)

            for result in self.results:
                # Parse watermark information.
                partition_columns = []
                for watermark in result['watermarks']:
                    partition_columns.append(watermark['partition_key'])

                # Parse column information.
                column_names = result['column_names']
                column_descriptions = result['column_descriptions']
                column_types = result['column_types']
                column_sort_orders = result['column_sort_orders']
                zipped_columns = zip_longest(column_names, column_descriptions,
                                             column_types, column_sort_orders)

                column_metadatas = []
                for column_name, \
                        column_description, \
                        column_type, \
                        column_sort_order \
                        in zipped_columns:
                    if column_name in partition_columns:
                        is_partition_column = True
                    else:
                        is_partition_column = False
                    column_metadatas.append(
                        ColumnMetadata(
                            name=column_name,
                            description=column_description,
                            col_type=column_type,
                            sort_order=column_sort_order,
                            is_partition_column=is_partition_column,
                        ))

                yield TableMetadata(
                    database=result['database'],
                    cluster=result['cluster'],
                    schema=result['schema'],
                    name=result['name'],
                    description=result['description'],
                    columns=column_metadatas,
                    is_view=result['is_view'],
                    tags=result['tags'],
                )
예제 #14
0
def test_load_no_catalog(patched_config):
    record = TableMetadata(
        database='mock_database',
        cluster=None,
        schema='mock_schema',
        name='mock_table',
        markdown_blob='Test',
    )
    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)

    loader.close()
    file_path = os.path.join(patched_config.get("base_directory"),
                             'mock_database/mock_schema.mock_table.md')
    with open(file_path, 'r') as f:
        written_record = f.read()

    assert record.markdown_blob in written_record
예제 #15
0
def test_load_no_catalog(patched_config):
    record = TableMetadata(
        database="mock_database",
        cluster=None,
        schema="mock_schema",
        name="mock_table",
    )
    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)

    loader.close()
    file_path = os.path.join(patched_config.get("base_directory"),
                             "mock_database/mock_schema.mock_table.md")
    with open(file_path, "r") as f:
        written_record = f.read()

    assert "mock_schema" in written_record
    assert "mock_table" in written_record
    assert "mock_database" in written_record
예제 #16
0
def test_load_catalog_specified(patched_config):
    record = TableMetadata(
        database="mock_database",
        cluster="mock_catalog",
        schema="mock_schema",
        name="mock_table",
        markdown_blob="Test",
    )
    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)

    loader.close()
    file_path = os.path.join(
        patched_config.get("base_directory"),
        "mock_database/mock_catalog.mock_schema.mock_table.md",
    )
    with open(file_path, "r") as f:
        written_record = f.read()

    assert record.markdown_blob in written_record
    def _get_extract_iter(self):
        # type: () -> Iterator[TableMetadata]
        """
        Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata
        :return:
        """

        with self.database.snapshot() as snapshot:
            results = snapshot.execute_sql(self.sql_stmt)
            header = SpannerMetadataExtractor.HEADER
            headered_results = [
                dict(zip(header, result)) for result in results
            ]

            for _, group in groupby(headered_results, self._get_table_key):
                columns = []

                for row in group:
                    last_row = row
                    columns.append(
                        ColumnMetadata(
                            row["col_name"],
                            None,
                            row["col_type"],
                            row["col_sort_order"],
                        ))

                schema = "{}.{}".format(self._instance_id, self._database_id)

                yield TableMetadata(
                    database=self._connection_name or "spanner",
                    cluster=self._project_id,
                    schema=schema,
                    name=last_row["name"],
                    description=None,
                    columns=columns,
                )
    def test_extraction_with_multiple_result(self) -> None:
        with patch.object(SQLAlchemyExtractor,
                          "_get_connection") as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {
                "schema": "test_schema1",
                "name": "test_table1",
                "description": "test table 1",
                "is_view": 0,
                "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            }

            table1 = {
                "schema": "test_schema1",
                "name": "test_table2",
                "description": "test table 2",
                "is_view": 0,
                "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            }

            table2 = {
                "schema": "test_schema2",
                "name": "test_table3",
                "description": "test table 3",
                "is_view": 0,
                "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            }

            sql_execute.return_value = [
                self._union(
                    {
                        "col_name": "col_id1",
                        "data_type": "bigint",
                        "col_description": "description of col_id1",
                        "col_sort_order": 0,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "col_id2",
                        "data_type": "bigint",
                        "col_description": "description of col_id2",
                        "col_sort_order": 1,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "is_active",
                        "data_type": "boolean",
                        "col_description": None,
                        "col_sort_order": 2,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "source",
                        "data_type": "varchar",
                        "col_description": "description of source",
                        "col_sort_order": 3,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "etl_created_at",
                        "data_type": "timestamp",
                        "col_description": "description of etl_created_at",
                        "col_sort_order": 4,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "ds",
                        "data_type": "varchar",
                        "col_description": None,
                        "col_sort_order": 5,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "col_name",
                        "data_type": "varchar",
                        "col_description": "description of col_name",
                        "col_sort_order": 0,
                    },
                    table1,
                ),
                self._union(
                    {
                        "col_name": "col_name2",
                        "data_type": "varchar",
                        "col_description": "description of col_name2",
                        "col_sort_order": 1,
                    },
                    table1,
                ),
                self._union(
                    {
                        "col_name": "col_id3",
                        "data_type": "varchar",
                        "col_description": "description of col_id3",
                        "col_sort_order": 0,
                    },
                    table2,
                ),
                self._union(
                    {
                        "col_name": "col_name3",
                        "data_type": "varchar",
                        "col_description": "description of col_name3",
                        "col_sort_order": 1,
                    },
                    table2,
                ),
            ]

            extractor = PostgresMetadataExtractor()
            extractor.init(self.conf)

            expected = TableMetadata(
                "postgres",
                self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                "test_schema1",
                "test_table1",
                "test table 1",
                [
                    ColumnMetadata("col_id1", "description of col_id1",
                                   "bigint", 0),
                    ColumnMetadata("col_id2", "description of col_id2",
                                   "bigint", 1),
                    ColumnMetadata("is_active", None, "boolean", 2),
                    ColumnMetadata("source", "description of source",
                                   "varchar", 3),
                    ColumnMetadata(
                        "etl_created_at",
                        "description of etl_created_at",
                        "timestamp",
                        4,
                    ),
                    ColumnMetadata("ds", None, "varchar", 5),
                ],
                0,
            )
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                "postgres",
                self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                "test_schema1",
                "test_table2",
                "test table 2",
                [
                    ColumnMetadata("col_name", "description of col_name",
                                   "varchar", 0),
                    ColumnMetadata("col_name2", "description of col_name2",
                                   "varchar", 1),
                ],
                0,
            )
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            expected = TableMetadata(
                "postgres",
                self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
                "test_schema2",
                "test_table3",
                "test table 3",
                [
                    ColumnMetadata("col_id3", "description of col_id3",
                                   "varchar", 0),
                    ColumnMetadata("col_name3", "description of col_name3",
                                   "varchar", 1),
                ],
                0,
            )
            self.assertEqual(expected.__repr__(),
                             extractor.extract().__repr__())

            self.assertIsNone(extractor.extract())
            self.assertIsNone(extractor.extract())
예제 #19
0
    def _retrieve_tables(self, dataset) -> Any:
        for page in self._page_table_list_results(dataset):
            if "tables" not in page:
                continue

            for table in page["tables"]:
                tableRef = table["tableReference"]
                table_id = tableRef["tableId"]

                # BigQuery tables that have 8 digits as last characters are
                # considered date range tables and are grouped together in the
                # UI. (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.)

                if self._is_sharded_table(table_id):
                    # If the last eight characters are digits, we assume the
                    # table is of a table date range type and then we only need
                    # one schema definition
                    table_prefix = table_id[:-BigQueryMetadataExtractor.
                                            DATE_LENGTH]
                    if table_prefix in self.grouped_tables:
                        # If one table in the date range is processed, then
                        # ignore other ones (it adds too much metadata)
                        continue

                    table_id = table_prefix
                    self.grouped_tables.add(table_prefix)

                table = (self.bigquery_service.tables().get(
                    projectId=tableRef["projectId"],
                    datasetId=tableRef["datasetId"],
                    tableId=tableRef["tableId"],
                ).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES))

                tags_dict = None
                try:
                    # Fetch entry for given linked_resource
                    entry = self.datacatalog_service.lookup_entry(
                        request={
                            "linked_resource":
                            f"//bigquery.googleapis.com/projects/{tableRef['projectId']}/datasets/{tableRef['datasetId']}/tables/{tableRef['tableId']}"
                        })
                    if not isinstance(entry, dict):
                        entry_json = entry.__class__.to_json(entry)
                        entry = json.loads(entry_json)

                    # Fetch tags for given entry
                    tags = self.datacatalog_service.list_tags(
                        request={"parent": entry["name"]})
                    tags_dict = dict(tags)
                except Exception as e:
                    LOGGER.warning(
                        f"Error fetching tags from Data Catalog: {e}")

                cols = []
                if self._is_table_match_regex(tableRef):
                    # BigQuery tables also have interesting metadata about
                    # partitioning data location (EU/US), mod/create time, etc...
                    # Extract that some other time?
                    # Not all tables have schemas
                    if "schema" in table:
                        schema = table["schema"]
                        if "fields" in schema:
                            total_cols = 0
                            for column in schema["fields"]:
                                total_cols = self._iterate_over_cols(
                                    tags_dict, "", column, cols,
                                    total_cols + 1)

                table_tag = None
                if tags_dict and "tags" in tags_dict:
                    for tag in tags_dict["tags"]:
                        if "column" not in tag:
                            table_tag = tag

                table_meta = TableMetadata(
                    database=self._database,
                    cluster=tableRef["projectId"],
                    schema=tableRef["datasetId"],
                    name=table_id,
                    description=table.get("description", ""),
                    columns=cols,
                    is_view=table["type"] == "VIEW",
                    tags=table_tag,
                    labels=table.get("labels", ""),
                )

                yield (table_meta)
    def test_extraction_with_single_result(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {'schema': 'test_schema',
                     'name': 'test_table',
                     'description': 'a table for testing',
                     'cluster':
                     self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
                     'is_view': 'false'
                     }

            sql_execute.return_value = [
                self._union(
                    {'col_name': 'col_id1',
                     'col_type': 'number',
                     'col_description': 'description of id1',
                     'col_sort_order': 0}, table),
                self._union(
                    {'col_name': 'col_id2',
                     'col_type': 'number',
                     'col_description': 'description of id2',
                     'col_sort_order': 1}, table),
                self._union(
                    {'col_name': 'is_active',
                     'col_type': 'boolean',
                     'col_description': None,
                     'col_sort_order': 2}, table),
                self._union(
                    {'col_name': 'source',
                     'col_type': 'varchar',
                     'col_description': 'description of source',
                     'col_sort_order': 3}, table),
                self._union(
                    {'col_name': 'etl_created_at',
                     'col_type': 'timestamp_ltz',
                     'col_description': 'description of etl_created_at',
                     'col_sort_order': 4}, table),
                self._union(
                    {'col_name': 'ds',
                     'col_type': 'varchar',
                     'col_description': None,
                     'col_sort_order': 5}, table)
            ]

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata('prod', 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
                                     [ColumnMetadata('col_id1', 'description of id1', 'number', 0),
                                      ColumnMetadata('col_id2', 'description of id2', 'number', 1),
                                      ColumnMetadata('is_active', None, 'boolean', 2),
                                      ColumnMetadata('source', 'description of source', 'varchar', 3),
                                      ColumnMetadata('etl_created_at', 'description of etl_created_at',
                                                     'timestamp_ltz', 4),
                                      ColumnMetadata('ds', None, 'varchar', 5)])

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
예제 #21
0
    def test_extraction_with_single_result(self) -> None:
        with patch.object(GlueExtractor, "_search_tables") as mock_search:
            mock_search.return_value = [{
                "Name":
                "test_catalog_test_schema_test_table",
                "DatabaseName":
                "test_database",
                "Description":
                "a table for testing",
                "StorageDescriptor": {
                    "Columns": [
                        {
                            "Name": "col_id1",
                            "Type": "bigint",
                            "Comment": "description of id1",
                        },
                        {
                            "Name": "col_id2",
                            "Type": "bigint",
                            "Comment": "description of id2",
                        },
                        {
                            "Name": "is_active",
                            "Type": "boolean"
                        },
                        {
                            "Name": "source",
                            "Type": "varchar",
                            "Comment": "description of source",
                        },
                        {
                            "Name": "etl_created_at",
                            "Type": "timestamp",
                            "Comment": "description of etl_created_at",
                        },
                        {
                            "Name": "ds",
                            "Type": "varchar"
                        },
                    ],
                    "Location":
                    "test_catalog.test_schema.test_table",
                },
                "PartitionKeys": [
                    {
                        "Name": "partition_key1",
                        "Type": "string",
                        "Comment": "description of partition_key1",
                    },
                ],
                "TableType":
                "EXTERNAL_TABLE",
            }]

            extractor = GlueExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                "test_database",
                None,
                None,
                "test_catalog_test_schema_test_table",
                "a table for testing",
                [
                    ColumnMetadata("col_id1", "description of id1", "bigint",
                                   0),
                    ColumnMetadata("col_id2", "description of id2", "bigint",
                                   1),
                    ColumnMetadata("is_active", None, "boolean", 2),
                    ColumnMetadata("source", "description of source",
                                   "varchar", 3),
                    ColumnMetadata(
                        "etl_created_at",
                        "description of etl_created_at",
                        "timestamp",
                        4,
                    ),
                    ColumnMetadata("ds", None, "varchar", 5),
                    ColumnMetadata("partition_key1",
                                   "description of partition_key1", "string",
                                   6),
                ],
                False,
            )
            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
예제 #22
0
    def test_extraction_with_single_result(self):
        # type: () -> None
        with patch.object(SQLAlchemyExtractor,
                          "_get_connection") as mock_connection:
            connection = MagicMock()
            mock_connection.return_value = connection
            sql_execute = MagicMock()
            connection.execute = sql_execute
            table = {
                "schema": "test_schema",
                "name": "test_table",
                "description": "a table for testing",
                "cluster": self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
                "is_view": "false",
            }

            sql_execute.return_value = [
                self._union(
                    {
                        "col_name": "col_id1",
                        "data_type": "number",
                        "col_description": "description of id1",
                        "col_sort_order": 0,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "col_id2",
                        "data_type": "number",
                        "col_description": "description of id2",
                        "col_sort_order": 1,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "is_active",
                        "data_type": "boolean",
                        "col_description": None,
                        "col_sort_order": 2,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "source",
                        "data_type": "varchar",
                        "col_description": "description of source",
                        "col_sort_order": 3,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "etl_created_at",
                        "data_type": "timestamp_ltz",
                        "col_description": "description of etl_created_at",
                        "col_sort_order": 4,
                    },
                    table,
                ),
                self._union(
                    {
                        "col_name": "ds",
                        "data_type": "varchar",
                        "col_description": None,
                        "col_sort_order": 5,
                    },
                    table,
                ),
            ]

            extractor = SnowflakeMetadataExtractor()
            extractor.init(self.conf)
            actual = extractor.extract()
            expected = TableMetadata(
                "prod",
                "MY_CLUSTER",
                "test_schema",
                "test_table",
                "a table for testing",
                [
                    ColumnMetadata("col_id1", "description of id1", "number",
                                   0),
                    ColumnMetadata("col_id2", "description of id2", "number",
                                   1),
                    ColumnMetadata("is_active", None, "boolean", 2),
                    ColumnMetadata("source", "description of source",
                                   "varchar", 3),
                    ColumnMetadata(
                        "etl_created_at",
                        "description of etl_created_at",
                        "timestamp_ltz",
                        4,
                    ),
                    ColumnMetadata("ds", None, "varchar", 5),
                ],
            )

            self.assertEqual(expected.__repr__(), actual.__repr__())
            self.assertIsNone(extractor.extract())
예제 #23
0
    def get_all_table_metadata_from_information_schema(
        self,
        cluster: Optional[str] = None,
        where_clause_suffix: str = "",
    ):

        unformatted_query = """
        SELECT
          a.table_catalog AS cluster
          , a.table_schema AS schema
          , a.table_name AS name
          , NULL AS description
          , a.column_name AS col_name
          , a.ordinal_position as col_sort_order
          , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
          , a.comment AS col_description
          , a.data_type
          , IF(b.table_name is not null, 1, 0) AS is_view
        FROM {cluster_prefix}information_schema.columns a
        LEFT JOIN {cluster_prefix}information_schema.views b
            ON a.table_catalog = b.table_catalog
            and a.table_schema = b.table_schema
            and a.table_name = b.table_name
        {where_clause_suffix}
        """

        LOGGER.info(
            "Pulling all table metadata in bulk from"
            + "information_schema in cluster name: {}".format(cluster)
        )

        if cluster is not None:
            cluster_prefix = cluster + "."
        else:
            cluster_prefix = ""

        formatted_query = unformatted_query.format(
            cluster_prefix=cluster_prefix, where_clause_suffix=where_clause_suffix
        )

        LOGGER.info("SQL for presto: {}".format(formatted_query))

        query_results = self.execute(formatted_query, is_dict_return_enabled=True)

        for _, group in groupby(query_results, self._get_table_key):
            columns = []
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        row["col_description"],
                        row["data_type"],
                        row["col_sort_order"],
                    )
                )

            yield TableMetadata(
                self._database,
                cluster or self._default_cluster_name,
                last_row["schema"],
                last_row["name"],
                last_row["description"],
                columns,
                is_view=bool(last_row["is_view"]),
            )