def test_multiple_results(self, mock_client):
    col1_name = "col1"
    col2_name = "col2"
    col1_type = "int"
    col2_type = "char"
    col1_sort_order = "1"
    col2_sort_order = "2"

    # The snapshot context manager yields the object whose execute_sql we stub.
    mock_snapshot = (
        mock_client.return_value.instance.return_value.database.return_value
        .snapshot.return_value.__enter__.return_value)
    mock_snapshot.execute_sql.return_value = [
        [col1_name, col1_type, col1_sort_order, self.schema, self.table],
        [col2_name, col2_type, col2_sort_order, self.schema, self.table],
    ]

    extractor = SpannerMetadataExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope()))

    result = extractor.extract()
    assert result.database == self.connection_name
    assert result.cluster == self.project_id
    assert result.schema == f"{self.instance_id}.{self.database_id}"
    assert result.name == self.table
    self.assertEqual(
        result.columns[0].__repr__(),
        ColumnMetadata(col1_name, None, col1_type, col1_sort_order,
                       None).__repr__(),
    )
    self.assertEqual(
        result.columns[1].__repr__(),
        ColumnMetadata(col2_name, None, col2_type, col2_sort_order,
                       None).__repr__(),
    )
def test_format_for_markdown(self):
    table_metadata = TableMetadata(
        database='test_database',
        cluster='test_cluster',
        schema='test_schema',
        name='test_table',
        columns=[
            ColumnMetadata(
                name='test_column_1',
                description=None,
                data_type='INTEGER',
                sort_order=1,
            ),
            ColumnMetadata(
                name='test_column_2',
                description=None,
                data_type='BOOLEAN',
                sort_order=2,
            ),
        ],
    )
    expected = """# `test_schema.test_table`
`test_database` | `test_cluster`
## Column details
* [INTEGER] `test_column_1`
* [BOOLEAN] `test_column_2`
"""
    self.assertEqual(table_metadata.format_for_markdown(), expected)
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Using itertools.groupby over the raw-level iterator, group rows by
    table and yield TableMetadata.
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            column_description = (unidecode(row["col_description"])
                                  if row["col_description"] else None)
            last_row = row
            columns.append(
                ColumnMetadata(
                    name=row["col_name"],
                    description=column_description,
                    data_type=row["data_type"],
                    sort_order=row["col_sort_order"],
                ))

        description = (unidecode(last_row["description"])
                       if last_row["description"] else None)

        yield TableMetadata(
            database=self._database,
            cluster=last_row["cluster"],
            schema=last_row["schema"],
            name=last_row["name"],
            description=description,
            columns=columns,
            is_view=last_row["is_view"] == "true",
        )
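# Note on the groupby pattern used above and in the variants below:
# itertools.groupby only merges *adjacent* rows sharing a key, so the raw
# iterator must already be ordered by table. A minimal sketch with toy rows
# and a hypothetical key function (not the extractor's own _get_table_key):
from itertools import groupby

rows = [
    {"schema": "s1", "name": "t1", "col_name": "a"},
    {"schema": "s1", "name": "t1", "col_name": "b"},
    {"schema": "s1", "name": "t2", "col_name": "a"},
]

def table_key(row):
    return (row["schema"], row["name"])

for key, group in groupby(rows, table_key):
    print(key, [r["col_name"] for r in group])
# ('s1', 't1') ['a', 'b']
# ('s1', 't2') ['a']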
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Using itertools.groupby over the raw-level iterator, group rows by
    table and yield TableMetadata.
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                ))

        yield TableMetadata(
            self._database,
            self._cluster,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Using itertools.groupby over the raw-level iterator, group rows by
    table and yield TableMetadata.
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                ))

        # Deviating from amundsen to add `is_view`
        yield TableMetadata(
            self._database,
            last_row["cluster"],
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            last_row["is_view"],
        )
def _iterate_over_cols(
    self,
    tags_dict: dict,
    parent: str,
    column: dict,
    cols: List[ColumnMetadata],
    total_cols: int,
) -> int:
    # Prefix nested fields with their parent's dotted path.
    if parent:
        col_name = "{parent}.{field}".format(parent=parent,
                                             field=column["name"])
    else:
        col_name = column["name"]

    # Pick up any column-level tags that match this (possibly nested) name.
    tags = None
    if tags_dict and "tags" in tags_dict:
        for tag in tags_dict["tags"]:
            if tag.get("column") == col_name:
                tags = tag

    col = ColumnMetadata(
        name=col_name,
        description=column.get("description", ""),
        data_type=column["type"],
        sort_order=total_cols,
        tags=tags,
    )
    cols.append(col)
    total_cols += 1

    # RECORD columns nest further fields; recurse to flatten them.
    if column["type"] == "RECORD":
        for field in column["fields"]:
            total_cols = self._iterate_over_cols(tags_dict, col_name, field,
                                                 cols, total_cols)
    return total_cols
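# A standalone sketch of the same flattening recursion, to show how nested
# RECORD fields become dotted column names; flatten_column_names is a
# hypothetical helper written for illustration, not the extractor's API.
from typing import Dict, List

def flatten_column_names(column: Dict, parent: str = "") -> List[str]:
    name = "{}.{}".format(parent, column["name"]) if parent else column["name"]
    names = [name]
    if column["type"] == "RECORD":
        for field in column["fields"]:
            names.extend(flatten_column_names(field, name))
    return names

# flatten_column_names({
#     "name": "address", "type": "RECORD", "fields": [
#         {"name": "city", "type": "STRING"},
#         {"name": "zip", "type": "STRING"},
#     ],
# })
# -> ["address", "address.city", "address.zip"]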
def get_table_metadata(
    self,
    schema: str,
    table: str,
    cluster: Optional[str] = None,
    is_view_query_enabled: Optional[bool] = False,
):
    # Format table and schema addresses for queries.
    full_schema_address = self._get_full_schema_address(cluster, schema)
    full_table_address = "{}.{}".format(full_schema_address, table)

    # Execute query that gets column type + partition information.
    columns_query = "show columns in {}".format(full_table_address)
    column_query_results = self.execute(columns_query, has_header=True)
    column_query_field_names = next(column_query_results)
    columns = []
    for i, column_query_result in enumerate(column_query_results):
        column_dict = dict(zip(column_query_field_names, column_query_result))
        columns.append(
            ColumnMetadata(
                name=column_dict["Column"],
                description=column_dict["Comment"],
                data_type=column_dict["Type"],
                sort_order=i,
                is_partition_column=column_dict["Extra"] == "partition key",
            )
        )

    if is_view_query_enabled:
        # Execute query that returns if table is a view.
        view_query = """
            select table_type from information_schema.tables
            where table_schema='{table_schema}' and table_name='{table_name}'
        """.format(table_schema=schema, table_name=table)
        view_query_results = self.execute(view_query, has_header=False)
        is_view = next(view_query_results)[0] == "VIEW"
    else:
        is_view = False

    return TableMetadata(
        database=self._database,
        cluster=cluster,
        schema=schema,
        name=table,
        description=None,
        columns=columns,
        is_view=is_view,
    )
def _get_extract_iter(self):
    with self.driver.session() as session:
        if not hasattr(self, "results"):
            self.results = session.read_transaction(self._execute_query)

        for result in self.results:
            # Parse watermark information.
            partition_columns = []
            for watermark in result["watermarks"]:
                partition_columns.append(watermark["partition_key"])

            # Parse column information.
            column_names = result["column_names"]
            column_descriptions = result["column_descriptions"]
            column_types = result["column_types"]
            column_sort_orders = result["column_sort_orders"]
            zipped_columns = zip_longest(column_names, column_descriptions,
                                         column_types, column_sort_orders)

            column_metadatas = []
            for (
                column_name,
                column_description,
                column_type,
                column_sort_order,
            ) in zipped_columns:
                is_partition_column = column_name in partition_columns
                column_metadatas.append(
                    ColumnMetadata(
                        name=column_name,
                        description=column_description,
                        data_type=column_type,
                        sort_order=column_sort_order,
                        is_partition_column=is_partition_column,
                    ))

            yield TableMetadata(
                database=result["database"],
                cluster=result["cluster"],
                schema=result["schema"],
                name=result["name"],
                description=result["description"],
                columns=column_metadatas,
                is_view=result["is_view"],
                tags=result["tags"],
            )
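# zip_longest (rather than zip) pads the shorter lists with None, so a
# missing description or sort order surfaces as a None field instead of
# silently dropping the column. A quick illustration:
from itertools import zip_longest

names = ["id", "name"]
descriptions = ["primary key"]  # one description missing
print(list(zip_longest(names, descriptions)))
# [('id', 'primary key'), ('name', None)]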
def test_extraction_with_single_result(self):
    with patch.object(
        splice_machine_metadata_extractor, "splice_connect"
    ) as mock_connect:
        column = ColumnMetadata("column1", None, "int", 0)
        table = TableMetadata(
            self.DATABASE,
            self.CLUSTER,
            "test_schema",
            "test_table",
            None,
            [column],
        )

        # Connection returns a cursor.
        mock_cursor = MagicMock()
        mock_execute = MagicMock()
        mock_fetchall = MagicMock()

        # self.connection = splice_connect(...)
        mock_connection = MagicMock()
        mock_connect.return_value = mock_connection
        # self.cursor = self.connection.cursor()
        mock_connection.cursor.return_value = mock_cursor
        # self.cursor.execute(...)
        mock_cursor.execute = mock_execute
        # for row in self.cursor.fetchall()
        mock_cursor.fetchall = mock_fetchall
        mock_fetchall.return_value = [
            [
                table.schema,
                table.name,
                "not-a-view",
                column.name,
                column.sort_order,
                column.type,
            ]
        ]

        extractor = self.Extractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = table

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_get_all_table_metadata_from_information_schema(
        self, mock_settings) -> None:
    self.engine.init(self.conf)
    self.engine.execute = MagicMock(
        side_effect=presto_engine_execute_side_effect)

    mock_columns = [
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_1["col_name"],
            description=MOCK_INFORMATION_SCHEMA_RESULT_1["col_description"],  # noqa: E501
            data_type=MOCK_INFORMATION_SCHEMA_RESULT_1["data_type"],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_1["col_sort_order"],
            is_partition_column=None,
        ),
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_2["col_name"],
            description=MOCK_INFORMATION_SCHEMA_RESULT_2["col_description"],  # noqa: E501
            data_type=MOCK_INFORMATION_SCHEMA_RESULT_2["data_type"],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_2["col_sort_order"],
            is_partition_column=None,
        ),
    ]
    expected = TableMetadata(
        database=MOCK_DATABASE_NAME,
        cluster=MOCK_CLUSTER_NAME,
        schema=MOCK_SCHEMA_NAME,
        name=MOCK_TABLE_NAME,
        columns=mock_columns,
        is_view=bool(MOCK_INFORMATION_SCHEMA_RESULT_1["is_view"]),
    )

    results = self.engine.get_all_table_metadata_from_information_schema(
        cluster=MOCK_CLUSTER_NAME)
    result = next(results)

    self.maxDiff = None
    self.assertEqual(result.__repr__(), expected.__repr__())
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    for row in self._get_raw_extract_iter():
        # Partition keys are appended after regular columns, so their
        # sort order continues where the regular columns leave off.
        columns = []
        all_columns = (row["StorageDescriptor"]["Columns"]
                       + row.get("PartitionKeys", []))
        for i, column in enumerate(all_columns):
            columns.append(
                ColumnMetadata(
                    column["Name"],
                    column.get("Comment"),
                    column["Type"],
                    i,
                ))

        if self._is_location_parsing_enabled:
            catalog, schema, table = self._parse_location(
                location=row["StorageDescriptor"]["Location"],
                name=row["Name"])
        else:
            catalog = None
            schema = None
            table = row["Name"]

        if self._connection_name:
            database = self._connection_name + "/" + row["DatabaseName"]
        else:
            database = row["DatabaseName"]

        yield TableMetadata(
            database,
            catalog,
            schema,
            table,
            row.get("Description") or row.get("Parameters", {}).get("comment"),
            columns,
            row.get("TableType") == "VIRTUAL_VIEW",
        )
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Using itertools.groupby over the raw-level iterator, group rows by
    table and yield TableMetadata.
    :return:
    """
    with self.database.snapshot() as snapshot:
        results = snapshot.execute_sql(self.sql_stmt)
        header = SpannerMetadataExtractor.HEADER
        headered_results = [dict(zip(header, result)) for result in results]
        schema = "{}.{}".format(self._instance_id, self._database_id)

        for _, group in groupby(headered_results, self._get_table_key):
            columns = []
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        None,
                        row["data_type"],
                        row["col_sort_order"],
                    ))

            yield TableMetadata(
                database=self._connection_name or "spanner",
                cluster=self._project_id,
                schema=schema,
                name=last_row["name"],
                description=None,
                columns=columns,
            )
def test_extraction_with_single_result(self):
    # type: () -> None
    with patch.object(SQLAlchemyExtractor, "_get_connection") as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {
            "schema": "test_schema",
            "name": "test_table",
            "description": "a table for testing",
            "cluster": self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
            "is_view": "false",
        }
        sql_execute.return_value = [
            self._union(
                {
                    "col_name": "col_id1",
                    "data_type": "number",
                    "col_description": "description of id1",
                    "col_sort_order": 0,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_id2",
                    "data_type": "number",
                    "col_description": "description of id2",
                    "col_sort_order": 1,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "is_active",
                    "data_type": "boolean",
                    "col_description": None,
                    "col_sort_order": 2,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "source",
                    "data_type": "varchar",
                    "col_description": "description of source",
                    "col_sort_order": 3,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "etl_created_at",
                    "data_type": "timestamp_ltz",
                    "col_description": "description of etl_created_at",
                    "col_sort_order": 4,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "ds",
                    "data_type": "varchar",
                    "col_description": None,
                    "col_sort_order": 5,
                },
                table,
            ),
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "prod",
            "MY_CLUSTER",
            "test_schema",
            "test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "number", 0),
                ColumnMetadata("col_id2", "description of id2", "number", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp_ltz",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
            ],
        )

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_multiple_result(self) -> None:
    with patch.object(SQLAlchemyExtractor, "_get_connection") as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {
            "schema": "test_schema1",
            "name": "test_table1",
            "description": "test table 1",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }
        table1 = {
            "schema": "test_schema1",
            "name": "test_table2",
            "description": "test table 2",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }
        table2 = {
            "schema": "test_schema2",
            "name": "test_table3",
            "description": "test table 3",
            "is_view": 0,
            "cluster": self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
        }
        sql_execute.return_value = [
            self._union(
                {
                    "col_name": "col_id1",
                    "data_type": "bigint",
                    "col_description": "description of col_id1",
                    "col_sort_order": 0,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_id2",
                    "data_type": "bigint",
                    "col_description": "description of col_id2",
                    "col_sort_order": 1,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "is_active",
                    "data_type": "boolean",
                    "col_description": None,
                    "col_sort_order": 2,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "source",
                    "data_type": "varchar",
                    "col_description": "description of source",
                    "col_sort_order": 3,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "etl_created_at",
                    "data_type": "timestamp",
                    "col_description": "description of etl_created_at",
                    "col_sort_order": 4,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "ds",
                    "data_type": "varchar",
                    "col_description": None,
                    "col_sort_order": 5,
                },
                table,
            ),
            self._union(
                {
                    "col_name": "col_name",
                    "data_type": "varchar",
                    "col_description": "description of col_name",
                    "col_sort_order": 0,
                },
                table1,
            ),
            self._union(
                {
                    "col_name": "col_name2",
                    "data_type": "varchar",
                    "col_description": "description of col_name2",
                    "col_sort_order": 1,
                },
                table1,
            ),
            self._union(
                {
                    "col_name": "col_id3",
                    "data_type": "varchar",
                    "col_description": "description of col_id3",
                    "col_sort_order": 0,
                },
                table2,
            ),
            self._union(
                {
                    "col_name": "col_name3",
                    "data_type": "varchar",
                    "col_description": "description of col_name3",
                    "col_sort_order": 1,
                },
                table2,
            ),
        ]

        extractor = PostgresMetadataExtractor()
        extractor.init(self.conf)

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema1",
            "test_table1",
            "test table 1",
            [
                ColumnMetadata("col_id1", "description of col_id1", "bigint", 0),
                ColumnMetadata("col_id2", "description of col_id2", "bigint", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema1",
            "test_table2",
            "test table 2",
            [
                ColumnMetadata("col_name", "description of col_name", "varchar", 0),
                ColumnMetadata("col_name2", "description of col_name2", "varchar", 1),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        expected = TableMetadata(
            "postgres",
            self.conf[PostgresMetadataExtractor.CLUSTER_KEY],
            "test_schema2",
            "test_table3",
            "test table 3",
            [
                ColumnMetadata("col_id3", "description of col_id3", "varchar", 0),
                ColumnMetadata("col_name3", "description of col_name3", "varchar", 1),
            ],
            0,
        )
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())
        self.assertIsNone(extractor.extract())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self) -> None:
    with patch.object(GlueExtractor, "_search_tables") as mock_search:
        mock_search.return_value = [{
            "Name": "test_catalog_test_schema_test_table",
            "DatabaseName": "test_database",
            "Description": "a table for testing",
            "StorageDescriptor": {
                "Columns": [
                    {
                        "Name": "col_id1",
                        "Type": "bigint",
                        "Comment": "description of id1",
                    },
                    {
                        "Name": "col_id2",
                        "Type": "bigint",
                        "Comment": "description of id2",
                    },
                    {"Name": "is_active", "Type": "boolean"},
                    {
                        "Name": "source",
                        "Type": "varchar",
                        "Comment": "description of source",
                    },
                    {
                        "Name": "etl_created_at",
                        "Type": "timestamp",
                        "Comment": "description of etl_created_at",
                    },
                    {"Name": "ds", "Type": "varchar"},
                ],
                "Location": "test_catalog.test_schema.test_table",
            },
            "PartitionKeys": [
                {
                    "Name": "partition_key1",
                    "Type": "string",
                    "Comment": "description of partition_key1",
                },
            ],
            "TableType": "EXTERNAL_TABLE",
        }]

        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "test_database",
            None,
            None,
            "test_catalog_test_schema_test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "bigint", 0),
                ColumnMetadata("col_id2", "description of id2", "bigint", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata(
                    "etl_created_at",
                    "description of etl_created_at",
                    "timestamp",
                    4,
                ),
                ColumnMetadata("ds", None, "varchar", 5),
                ColumnMetadata("partition_key1",
                               "description of partition_key1", "string", 6),
            ],
            False,
        )
        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def get_all_table_metadata_from_information_schema(
    self,
    cluster: Optional[str] = None,
    where_clause_suffix: str = "",
):
    unformatted_query = """
    SELECT
      a.table_catalog AS cluster
    , a.table_schema AS schema
    , a.table_name AS name
    , NULL AS description
    , a.column_name AS col_name
    , a.ordinal_position AS col_sort_order
    , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
    , a.comment AS col_description
    , a.data_type
    , IF(b.table_name IS NOT NULL, 1, 0) AS is_view
    FROM {cluster_prefix}information_schema.columns a
    LEFT JOIN {cluster_prefix}information_schema.views b
        ON a.table_catalog = b.table_catalog
        AND a.table_schema = b.table_schema
        AND a.table_name = b.table_name
    {where_clause_suffix}
    """

    LOGGER.info(
        "Pulling all table metadata in bulk from "
        "information_schema in cluster name: {}".format(cluster))

    if cluster is not None:
        cluster_prefix = cluster + "."
    else:
        cluster_prefix = ""

    formatted_query = unformatted_query.format(
        cluster_prefix=cluster_prefix,
        where_clause_suffix=where_clause_suffix)
    LOGGER.info("SQL for presto: {}".format(formatted_query))

    query_results = self.execute(formatted_query, is_dict_return_enabled=True)
    for _, group in groupby(query_results, self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                ))

        yield TableMetadata(
            self._database,
            cluster or self._default_cluster_name,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )
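# A hedged usage sketch of the generator above; it assumes an engine already
# initialized as in the earlier test, and the filter is a hypothetical
# example. The suffix is interpolated verbatim after the LEFT JOIN, so it
# must be a complete WHERE clause.
tables = engine.get_all_table_metadata_from_information_schema(
    cluster="hive",
    where_clause_suffix="WHERE a.table_schema = 'analytics'",
)
for table in tables:
    print(table.schema, table.name, len(table.columns))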