def _get_extract_iter(self) -> Iterator[TableMetadata]:
    for row in self._get_raw_extract_iter():
        columns, i = [], 0
        for column in row["StorageDescriptor"]["Columns"] + row.get("PartitionKeys", []):
            columns.append(
                ColumnMetadata(
                    column["Name"],
                    column["Comment"] if "Comment" in column else None,
                    column["Type"],
                    i,
                ))
            i += 1

        catalog, schema, table = self._parse_location(
            location=row["StorageDescriptor"]["Location"], name=row["Name"])

        if self._connection_name:
            database = self._connection_name + "/" + row["DatabaseName"]
        else:
            database = row["DatabaseName"]

        yield TableMetadata(
            database,
            catalog,
            schema,
            table,
            row.get("Description") or row.get("Parameters", {}).get("comment"),
            columns,
            row.get("TableType") == "VIRTUAL_VIEW",
        )
def _get_extract_iter(self) -> Iterator[TableMetadata]: """ Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata :return: """ for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key): columns = [] for row in group: last_row = row columns.append( ColumnMetadata( name=row["column_name"], description=None, col_type=row["column_type"], sort_order=row["column_sort_order"], )) yield TableMetadata( database=self._database, cluster=None, schema=last_row["schema_name"], name=last_row["table_name"], description=None, columns=columns, is_view=last_row["table_type"] == "V", )
def _get_extract_iter(self) -> Iterator[TableMetadata]: """ Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata :return: """ for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key): columns = [] for row in group: column_description = \ unidecode(row['col_description']) \ if row['col_description'] else None last_row = row columns.append( ColumnMetadata(name=row['col_name'], description=column_description, col_type=row['col_type'], sort_order=row['col_sort_order'])) description = \ unidecode(last_row['description']) \ if last_row['description'] else None yield TableMetadata(database=self._database, cluster=last_row['cluster'], schema=last_row['schema'], name=last_row['name'], description=description, columns=columns, is_view=last_row['is_view'] == 'true')
def _get_extract_iter(self) -> Iterator[TableMetadata]: """ Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata :return: """ for key, group in groupby(self._get_raw_extract_iter(), self._get_table_key): columns = [] for row in group: last_row = row columns.append( ColumnMetadata( row["col_name"], row["col_description"], row["data_type"], row["col_sort_order"], ) ) yield TableMetadata( self._database, last_row["cluster"], last_row["schema"], last_row["name"], last_row["description"], columns, )
def _get_extract_iter(self) -> Iterator[TableMetadata]: """ Using itertools.groupby and raw level iterator, it groups to table and yields TableMetadata :return: """ for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key): columns = [] for row in group: column_description = (unidecode(row["col_description"]) if row["col_description"] else None) last_row = row columns.append( ColumnMetadata( name=row["col_name"], description=column_description, data_type=row["data_type"], sort_order=row["col_sort_order"], )) description = (unidecode(last_row["description"]) if last_row["description"] else None) yield TableMetadata( database=self._database, cluster=last_row["cluster"], schema=last_row["schema"], name=last_row["name"], description=description, columns=columns, is_view=last_row["is_view"] == "true", )
def setUp(self) -> None:
    logging.basicConfig(level=logging.INFO)
    self.database = "mock_database"
    self.schema = "mock_schema"
    self.table = "mock_table"

    tmp_path = tempfile.NamedTemporaryFile()
    self.table_stub_path = os.path.join(
        tmp_path.name, self.database, f"{self.schema}.{self.table}.md")

    self.loader_config = ConfigFactory.from_dict({
        "table_stub_paths": [self.table_stub_path],
        "base_directory": tmp_path.name,
    })
    self.extractor_config = ConfigFactory.from_dict({
        MetricRunner.DATABASE_KEY: self.database,
        "table_stub_paths": [self.table_stub_path],
    })

    self.record = TableMetadata(
        database=self.database,
        cluster=None,
        schema=self.schema,
        name=self.table,
    )
def test_transformed_record_contains_components(self):
    """The transformed markdown blob should contain every component of the record."""
    column = ColumnMetadata(
        name=COLUMN,
        col_type="Integer",
        sort_order=0,
        description=COLUMN_DESCRIPTION,
    )
    record = TableMetadata(
        database=DATABASE,
        cluster=CLUSTER,
        schema=SCHEMA,
        name=TABLE,
        columns=[column],
    )
    components = [
        DATABASE,
        CLUSTER,
        SCHEMA,
        TABLE,
        COLUMN,
        COLUMN_DESCRIPTION,
    ]

    transformer = MarkdownTransformer()
    transformer.init(self._conf)
    transformed_record = transformer.transform(record)
    markdown_blob = transformed_record.markdown_blob
    transformer.close()

    has_components = all(x in markdown_blob for x in components)
    self.assertEqual(has_components, True)
def _retrieve_tables(self, dataset) -> Any:
    for page in self._page_table_list_results(dataset):
        if "tables" not in page:
            continue

        for table in page["tables"]:
            tableRef = table["tableReference"]
            table_id = tableRef["tableId"]

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the
            # UI. (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.)
            if self._is_sharded_table(table_id):
                # If the last eight characters are digits, we assume the
                # table is of a table date range type and then we only need
                # one schema definition
                table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                if table_prefix in self.grouped_tables:
                    # If one table in the date range is processed, then
                    # ignore other ones (it adds too much metadata)
                    continue

                table_id = table_prefix
                self.grouped_tables.add(table_prefix)

            table = (self.bigquery_service.tables().get(
                projectId=tableRef["projectId"],
                datasetId=tableRef["datasetId"],
                tableId=tableRef["tableId"],
            ).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES))

            cols = []
            if self._is_table_match_regex(tableRef):
                # BigQuery tables also have interesting metadata about
                # partitioning data location (EU/US), mod/create time, etc...
                # Extract that some other time?
                # Not all tables have schemas
                if "schema" in table:
                    schema = table["schema"]
                    if "fields" in schema:
                        total_cols = 0
                        for column in schema["fields"]:
                            total_cols = self._iterate_over_cols(
                                "", column, cols, total_cols + 1)

            table_meta = TableMetadata(
                database="bigquery",
                cluster=tableRef["projectId"],
                schema=tableRef["datasetId"],
                name=table_id,
                description=table.get("description", ""),
                columns=cols,
                is_view=table["type"] == "VIEW",
            )

            yield table_meta
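# Illustrative sketch only: _is_sharded_table and DATE_LENGTH are not shown in
# the snippet above. Based on its comments ("last eight characters are digits",
# e.g. ga_sessions_20190101), a check of that shape could look like the
# following; the names here are assumptions, not the extractor's actual code.
import re

DATE_LENGTH = 8
_SHARDED_SUFFIX = re.compile(r"\d{%d}$" % DATE_LENGTH)


def is_sharded_table(table_id: str) -> bool:
    # True when the table name ends in an 8-digit date suffix.
    return _SHARDED_SUFFIX.search(table_id) is not None


assert is_sharded_table("ga_sessions_20190101")
assert not is_sharded_table("users")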
def get_table_metadata(
    self,
    schema: str,
    table: str,
    cluster: Optional[str] = None,
    is_view_query_enabled: Optional[bool] = False,
):
    # Format table and schema addresses for queries.
    full_schema_address = self._get_full_schema_address(cluster, schema)
    full_table_address = "{}.{}".format(full_schema_address, table)

    # Execute query that gets column type + partition information.
    columns_query = "show columns in {}".format(full_table_address)
    column_query_results = self.execute(columns_query, has_header=True)
    column_query_field_names = next(column_query_results)
    columns = []
    for i, column_query_result in enumerate(column_query_results):
        column_dict = dict(zip(column_query_field_names, column_query_result))
        columns.append(
            ColumnMetadata(
                name=column_dict["Column"],
                description=column_dict["Comment"],
                data_type=column_dict["Type"],
                sort_order=i,
                is_partition_column=column_dict["Extra"] == "partition key",
            )
        )

    if is_view_query_enabled:
        # Execute query that returns if table is a view.
        view_query = """
            select table_type
            from information_schema.tables
            where table_schema='{table_schema}'
              and table_name='{table_name}'
            """.format(table_schema=schema, table_name=table)
        view_query_results = self.execute(view_query, has_header=False)
        is_view = next(view_query_results)[0] == "VIEW"
    else:
        is_view = False

    return TableMetadata(
        database=self._database,
        cluster=cluster,
        schema=schema,
        name=table,
        description=None,
        columns=columns,
        is_view=is_view,
    )
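# Illustrative sketch, standalone: the loop above pairs the header row returned
# by `show columns in ...` with each data row via dict(zip(...)), then reads the
# "Column", "Type", "Extra", and "Comment" keys. The sample values below are
# made up; only the key names come from the code above.
header = ["Column", "Type", "Extra", "Comment"]
row = ["ds", "varchar", "partition key", "partition date"]

column_dict = dict(zip(header, row))
assert column_dict["Column"] == "ds"
assert column_dict["Extra"] == "partition key"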
def _get_extract_iter(self):
    with self.driver.session() as session:
        if not hasattr(self, "results"):
            self.results = session.read_transaction(self._execute_query)

        for result in self.results:
            # Parse watermark information.
            partition_columns = []
            for watermark in result["watermarks"]:
                partition_columns.append(watermark["partition_key"])

            # Parse column information.
            column_names = result["column_names"]
            column_descriptions = result["column_descriptions"]
            column_types = result["column_types"]
            column_sort_orders = result["column_sort_orders"]
            zipped_columns = zip_longest(column_names, column_descriptions,
                                         column_types, column_sort_orders)

            column_metadatas = []
            for (
                column_name,
                column_description,
                column_type,
                column_sort_order,
            ) in zipped_columns:
                if column_name in partition_columns:
                    is_partition_column = True
                else:
                    is_partition_column = False
                column_metadatas.append(
                    ColumnMetadata(
                        name=column_name,
                        description=column_description,
                        col_type=column_type,
                        sort_order=column_sort_order,
                        is_partition_column=is_partition_column,
                    ))

            yield TableMetadata(
                database=result["database"],
                cluster=result["cluster"],
                schema=result["schema"],
                name=result["name"],
                description=result["description"],
                columns=column_metadatas,
                is_view=result["is_view"],
                tags=result["tags"],
            )
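# Illustrative sketch, standalone: zip_longest pads the shorter sequences with
# None instead of truncating, so a result whose column_descriptions list is
# shorter than column_names still produces one ColumnMetadata per column, with
# a None description. The lists below are made-up sample data.
from itertools import zip_longest

names = ["id", "ds"]
descriptions = ["primary key"]  # one description missing
types = ["bigint", "varchar"]
sort_orders = [0, 1]

assert list(zip_longest(names, descriptions, types, sort_orders)) == [
    ("id", "primary key", "bigint", 0),
    ("ds", None, "varchar", 1),
]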
def test_extraction_with_single_result(self):
    with patch.object(
        splice_machine_metadata_extractor, "splice_connect"
    ) as mock_connect:
        column = ColumnMetadata("column1", None, "int", 0)
        table = TableMetadata(
            self.DATABASE,
            self.CLUSTER,
            "test_schema",
            "test_table",
            None,
            [column],
        )

        # Connection returns a cursor
        mock_cursor = MagicMock()
        mock_execute = MagicMock()
        mock_fetchall = MagicMock()

        # self.connection = splice_connect(...)
        mock_connection = MagicMock()
        mock_connect.return_value = mock_connection

        # self.cursor = self.connection.cursor()
        mock_connection.cursor.return_value = mock_cursor

        # self.cursor.execute(...)
        mock_cursor.execute = mock_execute

        # for row in self.cursor.fetchall()
        mock_cursor.fetchall = mock_fetchall
        mock_fetchall.return_value = [
            [
                table.schema,
                table.name,
                "not-a-view",
                column.name,
                column.sort_order,
                column.type,
            ]
        ]

        extractor = self.Extractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = table

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_get_all_table_metadata_from_information_schema(
        self, mock_settings) -> None:
    self.engine.init(self.conf)
    self.engine.execute = MagicMock(
        side_effect=presto_engine_execute_side_effect)

    mock_columns = [
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_1["col_name"],
            description=MOCK_INFORMATION_SCHEMA_RESULT_1["col_description"],  # noqa: 501
            data_type=MOCK_INFORMATION_SCHEMA_RESULT_1["data_type"],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_1["col_sort_order"],
            is_partition_column=None,
        ),
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_2["col_name"],
            description=MOCK_INFORMATION_SCHEMA_RESULT_2["col_description"],  # noqa: 501
            data_type=MOCK_INFORMATION_SCHEMA_RESULT_2["data_type"],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_2["col_sort_order"],
            is_partition_column=None,
        ),
    ]
    expected = TableMetadata(
        database=MOCK_DATABASE_NAME,
        cluster=MOCK_CLUSTER_NAME,
        schema=MOCK_SCHEMA_NAME,
        name=MOCK_TABLE_NAME,
        columns=mock_columns,
        is_view=bool(MOCK_INFORMATION_SCHEMA_RESULT_1["is_view"]),
    )

    results = self.engine.get_all_table_metadata_from_information_schema(
        cluster=MOCK_CLUSTER_NAME)
    result = next(results)

    self.maxDiff = None
    self.assertEqual(result.__repr__(), expected.__repr__())
def _get_extract_iter(self):
    with self.driver.session() as session:
        if not hasattr(self, 'results'):
            self.results = session.read_transaction(self._execute_query)

        for result in self.results:
            # Parse watermark information.
            partition_columns = []
            for watermark in result['watermarks']:
                partition_columns.append(watermark['partition_key'])

            # Parse column information.
            column_names = result['column_names']
            column_descriptions = result['column_descriptions']
            column_types = result['column_types']
            column_sort_orders = result['column_sort_orders']
            zipped_columns = zip_longest(column_names, column_descriptions,
                                         column_types, column_sort_orders)

            column_metadatas = []
            for (
                column_name,
                column_description,
                column_type,
                column_sort_order,
            ) in zipped_columns:
                if column_name in partition_columns:
                    is_partition_column = True
                else:
                    is_partition_column = False
                column_metadatas.append(
                    ColumnMetadata(
                        name=column_name,
                        description=column_description,
                        col_type=column_type,
                        sort_order=column_sort_order,
                        is_partition_column=is_partition_column,
                    ))

            yield TableMetadata(
                database=result['database'],
                cluster=result['cluster'],
                schema=result['schema'],
                name=result['name'],
                description=result['description'],
                columns=column_metadatas,
                is_view=result['is_view'],
                tags=result['tags'],
            )
def test_load_no_catalog(patched_config):
    record = TableMetadata(
        database='mock_database',
        cluster=None,
        schema='mock_schema',
        name='mock_table',
        markdown_blob='Test',
    )

    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)
    loader.close()

    file_path = os.path.join(patched_config.get("base_directory"),
                             'mock_database/mock_schema.mock_table.md')
    with open(file_path, 'r') as f:
        written_record = f.read()

    assert record.markdown_blob in written_record
def test_load_no_catalog(patched_config):
    record = TableMetadata(
        database="mock_database",
        cluster=None,
        schema="mock_schema",
        name="mock_table",
    )

    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)
    loader.close()

    file_path = os.path.join(patched_config.get("base_directory"),
                             "mock_database/mock_schema.mock_table.md")
    with open(file_path, "r") as f:
        written_record = f.read()

    assert "mock_schema" in written_record
    assert "mock_table" in written_record
    assert "mock_database" in written_record
def test_load_catalog_specified(patched_config):
    record = TableMetadata(
        database="mock_database",
        cluster="mock_catalog",
        schema="mock_schema",
        name="mock_table",
        markdown_blob="Test",
    )

    loader = whale_loader.WhaleLoader()
    loader.init(patched_config)
    loader.load(record)
    loader.close()

    file_path = os.path.join(
        patched_config.get("base_directory"),
        "mock_database/mock_catalog.mock_schema.mock_table.md",
    )
    with open(file_path, "r") as f:
        written_record = f.read()

    assert record.markdown_blob in written_record
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Groups the raw, row-level iterator by table using itertools.groupby
    and yields one TableMetadata per table.
    """
    with self.database.snapshot() as snapshot:
        results = snapshot.execute_sql(self.sql_stmt)

        header = SpannerMetadataExtractor.HEADER
        headered_results = [dict(zip(header, result)) for result in results]

        for _, group in groupby(headered_results, self._get_table_key):
            columns = []
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        None,
                        row["col_type"],
                        row["col_sort_order"],
                    ))

            schema = "{}.{}".format(self._instance_id, self._database_id)
            yield TableMetadata(
                database=self._connection_name or "spanner",
                cluster=self._project_id,
                schema=schema,
                name=last_row["name"],
                description=None,
                columns=columns,
            )
def test_extraction_with_multiple_result(self) -> None:
    with patch.object(SQLAlchemyExtractor, "_get_connection") as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        table = {
            "schema": "test_schema1",
            "name": "test_table1",
            "description": "test table 1",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }

        table1 = {
            "schema": "test_schema1",
            "name": "test_table2",
            "description": "test table 2",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }

        table2 = {
            "schema": "test_schema2",
            "name": "test_table3",
            "description": "test table 3",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }

        sql_execute.return_value = [
            self._union(
                {
                    "col_name": "col_id1",
                    "data_type": "bigint",
                    "col_description": "description of col_id1",
                    "col_sort_order": 0,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_id2",
                    "data_type": "bigint",
                    "col_description": "description of col_id2",
                    "col_sort_order": 1,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "is_active",
                    "data_type": "boolean",
                    "col_description": None,
                    "col_sort_order": 2,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "source",
                    "data_type": "varchar",
                    "col_description": "description of source",
                    "col_sort_order": 3,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "etl_created_at",
                    "data_type": "timestamp",
                    "col_description": "description of etl_created_at",
                    "col_sort_order": 4,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "ds",
                    "data_type": "varchar",
                    "col_description": None,
                    "col_sort_order": 5,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_name",
                    "data_type": "varchar",
                    "col_description": "description of col_name",
                    "col_sort_order": 0,
                },
                table1,
            ),
            self._union(
                {
                    "col_name": "col_name2",
                    "data_type": "varchar",
                    "col_description": "description of col_name2",
                    "col_sort_order": 1,
                },
                table1,
            ),
            self._union(
                {
                    "col_name": "col_id3",
                    "data_type": "varchar",
                    "col_description": "description of col_id3",
                    "col_sort_order": 0,
                },
                table2,
            ),
            self._union(
                {
                    "col_name": "col_name3",
                    "data_type": "varchar",
                    "col_description": "description of col_name3",
                    "col_sort_order": 1,
                },
                table2,
            ),
        ]

        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema1",
            "test_table1",
            "test table 1",
            [
                ColumnMetadata("col_id1", "description of col_id1", "bigint", 0),
                ColumnMetadata("col_id2", "description of col_id2", "bigint", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema1",
            "test_table2",
            "test table 2",
            [
                ColumnMetadata("col_name", "description of col_name", "varchar", 0),
                ColumnMetadata("col_name2", "description of col_name2", "varchar", 1),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema2",
            "test_table3",
            "test table 3",
            [
                ColumnMetadata("col_id3", "description of col_id3", "varchar", 0),
                ColumnMetadata("col_name3", "description of col_name3", "varchar", 1),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
        self.assertIsNone(extractor.extract())
        self.assertIsNone(extractor.extract())
def _retrieve_tables(self, dataset) -> Any:
    for page in self._page_table_list_results(dataset):
        if "tables" not in page:
            continue

        for table in page["tables"]:
            tableRef = table["tableReference"]
            table_id = tableRef["tableId"]

            # BigQuery tables that have 8 digits as last characters are
            # considered date range tables and are grouped together in the
            # UI. (e.g. ga_sessions_20190101, ga_sessions_20190102, etc.)
            if self._is_sharded_table(table_id):
                # If the last eight characters are digits, we assume the
                # table is of a table date range type and then we only need
                # one schema definition
                table_prefix = table_id[:-BigQueryMetadataExtractor.DATE_LENGTH]
                if table_prefix in self.grouped_tables:
                    # If one table in the date range is processed, then
                    # ignore other ones (it adds too much metadata)
                    continue

                table_id = table_prefix
                self.grouped_tables.add(table_prefix)

            table = (self.bigquery_service.tables().get(
                projectId=tableRef["projectId"],
                datasetId=tableRef["datasetId"],
                tableId=tableRef["tableId"],
            ).execute(num_retries=BigQueryMetadataExtractor.NUM_RETRIES))

            tags_dict = None
            try:
                # Fetch entry for given linked_resource
                entry = self.datacatalog_service.lookup_entry(
                    request={
                        "linked_resource":
                        f"//bigquery.googleapis.com/projects/{tableRef['projectId']}/datasets/{tableRef['datasetId']}/tables/{tableRef['tableId']}"
                    })
                if not isinstance(entry, dict):
                    entry_json = entry.__class__.to_json(entry)
                    entry = json.loads(entry_json)

                # Fetch tags for given entry
                tags = self.datacatalog_service.list_tags(
                    request={"parent": entry["name"]})
                tags_dict = dict(tags)
            except Exception as e:
                LOGGER.warning(f"Error fetching tags from Data Catalog: {e}")

            cols = []
            if self._is_table_match_regex(tableRef):
                # BigQuery tables also have interesting metadata about
                # partitioning data location (EU/US), mod/create time, etc...
                # Extract that some other time?
                # Not all tables have schemas
                if "schema" in table:
                    schema = table["schema"]
                    if "fields" in schema:
                        total_cols = 0
                        for column in schema["fields"]:
                            total_cols = self._iterate_over_cols(
                                tags_dict, "", column, cols, total_cols + 1)

            table_tag = None
            if tags_dict and "tags" in tags_dict:
                for tag in tags_dict["tags"]:
                    if "column" not in tag:
                        table_tag = tag

            table_meta = TableMetadata(
                database=self._database,
                cluster=tableRef["projectId"],
                schema=tableRef["datasetId"],
                name=table_id,
                description=table.get("description", ""),
                columns=cols,
                is_view=table["type"] == "VIEW",
                tags=table_tag,
                labels=table.get("labels", ""),
            )

            yield table_meta
def test_extraction_with_single_result(self):
    # type: () -> None
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {'schema': 'test_schema',
                 'name': 'test_table',
                 'description': 'a table for testing',
                 'cluster': self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
                 'is_view': 'false'
                 }

        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1',
                 'col_type': 'number',
                 'col_description': 'description of id1',
                 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2',
                 'col_type': 'number',
                 'col_description': 'description of id2',
                 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active',
                 'col_type': 'boolean',
                 'col_description': None,
                 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source',
                 'col_type': 'varchar',
                 'col_description': 'description of source',
                 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at',
                 'col_type': 'timestamp_ltz',
                 'col_description': 'description of etl_created_at',
                 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds',
                 'col_type': 'varchar',
                 'col_description': None,
                 'col_sort_order': 5}, table)
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'prod', 'MY_CLUSTER', 'test_schema', 'test_table',
            'a table for testing',
            [ColumnMetadata('col_id1', 'description of id1', 'number', 0),
             ColumnMetadata('col_id2', 'description of id2', 'number', 1),
             ColumnMetadata('is_active', None, 'boolean', 2),
             ColumnMetadata('source', 'description of source', 'varchar', 3),
             ColumnMetadata('etl_created_at', 'description of etl_created_at',
                            'timestamp_ltz', 4),
             ColumnMetadata('ds', None, 'varchar', 5)])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self) -> None:
    with patch.object(GlueExtractor, "_search_tables") as mock_search:
        mock_search.return_value = [{
            "Name": "test_catalog_test_schema_test_table",
            "DatabaseName": "test_database",
            "Description": "a table for testing",
            "StorageDescriptor": {
                "Columns": [
                    {
                        "Name": "col_id1",
                        "Type": "bigint",
                        "Comment": "description of id1",
                    },
                    {
                        "Name": "col_id2",
                        "Type": "bigint",
                        "Comment": "description of id2",
                    },
                    {
                        "Name": "is_active",
                        "Type": "boolean",
                    },
                    {
                        "Name": "source",
                        "Type": "varchar",
                        "Comment": "description of source",
                    },
                    {
                        "Name": "etl_created_at",
                        "Type": "timestamp",
                        "Comment": "description of etl_created_at",
                    },
                    {
                        "Name": "ds",
                        "Type": "varchar",
                    },
                ],
                "Location": "test_catalog.test_schema.test_table",
            },
            "PartitionKeys": [
                {
                    "Name": "partition_key1",
                    "Type": "string",
                    "Comment": "description of partition_key1",
                },
            ],
            "TableType": "EXTERNAL_TABLE",
        }]

        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "test_database",
            None,
            None,
            "test_catalog_test_schema_test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "bigint", 0),
                ColumnMetadata("col_id2", "description of id2", "bigint", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
                ColumnMetadata("partition_key1", "description of partition_key1",
                               "string", 6),
            ],
            False,
        )
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self):
    # type: () -> None
    with patch.object(SQLAlchemyExtractor, "_get_connection") as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {
            "schema": "test_schema",
            "name": "test_table",
            "description": "a table for testing",
            "cluster": self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
            "is_view": "false",
        }

        sql_execute.return_value = [
            self._union(
                {
                    "col_name": "col_id1",
                    "data_type": "number",
                    "col_description": "description of id1",
                    "col_sort_order": 0,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_id2",
                    "data_type": "number",
                    "col_description": "description of id2",
                    "col_sort_order": 1,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "is_active",
                    "data_type": "boolean",
                    "col_description": None,
                    "col_sort_order": 2,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "source",
                    "data_type": "varchar",
                    "col_description": "description of source",
                    "col_sort_order": 3,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "etl_created_at",
                    "data_type": "timestamp_ltz",
                    "col_description": "description of etl_created_at",
                    "col_sort_order": 4,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "ds",
                    "data_type": "varchar",
                    "col_description": None,
                    "col_sort_order": 5,
                },
                table,
            ),
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "prod",
            "MY_CLUSTER",
            "test_schema",
            "test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "number", 0),
                ColumnMetadata("col_id2", "description of id2", "number", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp_ltz",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
            ],
        )

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def get_all_table_metadata_from_information_schema(
    self,
    cluster: Optional[str] = None,
    where_clause_suffix: str = "",
):
    unformatted_query = """
    SELECT
      a.table_catalog AS cluster
    , a.table_schema AS schema
    , a.table_name AS name
    , NULL AS description
    , a.column_name AS col_name
    , a.ordinal_position as col_sort_order
    , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
    , a.comment AS col_description
    , a.data_type
    , IF(b.table_name is not null, 1, 0) AS is_view
    FROM {cluster_prefix}information_schema.columns a
    LEFT JOIN {cluster_prefix}information_schema.views b
        ON a.table_catalog = b.table_catalog
        and a.table_schema = b.table_schema
        and a.table_name = b.table_name
    {where_clause_suffix}
    """

    LOGGER.info(
        "Pulling all table metadata in bulk from "
        + "information_schema in cluster name: {}".format(cluster)
    )

    if cluster is not None:
        cluster_prefix = cluster + "."
    else:
        cluster_prefix = ""

    formatted_query = unformatted_query.format(
        cluster_prefix=cluster_prefix, where_clause_suffix=where_clause_suffix
    )
    LOGGER.info("SQL for presto: {}".format(formatted_query))

    query_results = self.execute(formatted_query, is_dict_return_enabled=True)

    for _, group in groupby(query_results, self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                )
            )

        yield TableMetadata(
            self._database,
            cluster or self._default_cluster_name,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )