def _iterate_over_cols(self,
                       parent: str,
                       column: Dict,
                       cols: List[ColumnMetadata],
                       total_cols: int) -> int:
    # `column` is a schema field dict (indexed by 'name', 'type' and, for
    # RECORD types, 'fields'), not a string.
    if len(parent) > 0:
        col_name = '{parent}.{field}'.format(parent=parent, field=column['name'])
    else:
        col_name = column['name']

    if column['type'] == 'RECORD':
        col = ColumnMetadata(name=col_name,
                             description=column.get('description', ''),
                             col_type=column['type'],
                             sort_order=total_cols)
        cols.append(col)
        total_cols += 1
        for field in column['fields']:
            total_cols = self._iterate_over_cols(col_name, field, cols, total_cols)
        return total_cols
    else:
        col = ColumnMetadata(name=col_name,
                             description=column.get('description', ''),
                             col_type=column['type'],
                             sort_order=total_cols)
        cols.append(col)
        return total_cols + 1
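# A minimal standalone sketch (names and schema shape assumed, not from the
# source) of the flattening that _iterate_over_cols performs: a RECORD field
# gets its own entry and each child field is emitted under a dotted name.
nested_field = {
    'name': 'address',
    'type': 'RECORD',
    'fields': [
        {'name': 'city', 'type': 'STRING'},
        {'name': 'zip', 'type': 'STRING'},
    ],
}

def flatten(parent, column, out, order):
    # Simplified re-implementation of the traversal above, for illustration.
    name = '{}.{}'.format(parent, column['name']) if parent else column['name']
    out.append((name, column['type'], order))
    order += 1
    for field in column.get('fields', []):
        order = flatten(name, field, out, order)
    return order

cols = []
flatten('', nested_field, cols, 0)
# cols == [('address', 'RECORD', 0), ('address.city', 'STRING', 1),
#          ('address.zip', 'STRING', 2)]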
def test_multiple_results(self, mock_client):
    col1_name = "col1"
    col2_name = "col2"
    col1_type = "int"
    col2_type = "char"
    col1_sort_order = "1"
    col2_sort_order = "2"

    mock_client.return_value.instance.return_value.database.return_value \
        .snapshot.return_value.__enter__.return_value \
        .execute_sql.return_value = [
            [col1_name, col1_type, col1_sort_order, self.schema, self.table],
            [col2_name, col2_type, col2_sort_order, self.schema, self.table],
        ]

    extractor = SpannerMetadataExtractor()
    extractor.init(
        Scoped.get_scoped_conf(conf=self.conf, scope=extractor.get_scope())
    )
    result = extractor.extract()

    assert result.database == self.connection_name
    assert result.cluster == self.project_id
    assert result.schema == f"{self.instance_id}.{self.database_id}"
    assert result.name == self.table
    self.assertEqual(
        result.columns[0].__repr__(),
        ColumnMetadata(
            col1_name, None, col1_type, col1_sort_order, None
        ).__repr__(),
    )
    self.assertEqual(
        result.columns[1].__repr__(),
        ColumnMetadata(
            col2_name, None, col2_type, col2_sort_order, None
        ).__repr__(),
    )
def test_get_all_table_metadata_from_information_schema(
        self, mock_settings) -> None:
    self.engine.init(self.conf)
    self.engine.execute = MagicMock(
        side_effect=presto_engine_execute_side_effect)
    mock_columns = [
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_1['col_name'],
            description=MOCK_INFORMATION_SCHEMA_RESULT_1['col_description'],
            col_type=MOCK_INFORMATION_SCHEMA_RESULT_1['col_type'],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_1['col_sort_order'],
            is_partition_column=None),
        ColumnMetadata(
            name=MOCK_INFORMATION_SCHEMA_RESULT_2['col_name'],
            description=MOCK_INFORMATION_SCHEMA_RESULT_2['col_description'],
            col_type=MOCK_INFORMATION_SCHEMA_RESULT_2['col_type'],
            sort_order=MOCK_INFORMATION_SCHEMA_RESULT_2['col_sort_order'],
            is_partition_column=None),
    ]
    expected = TableMetadata(
        database=MOCK_DATABASE_NAME,
        cluster=MOCK_CLUSTER_NAME,
        schema=MOCK_SCHEMA_NAME,
        name=MOCK_TABLE_NAME,
        columns=mock_columns,
        is_view=bool(MOCK_INFORMATION_SCHEMA_RESULT_1['is_view']),
    )

    results = self.engine.get_all_table_metadata_from_information_schema(
        cluster=MOCK_CLUSTER_NAME)
    result = next(results)

    self.maxDiff = None
    self.assertEqual(result.__repr__(), expected.__repr__())
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    for row in self._get_raw_extract_iter():
        columns = []
        # Partition keys are appended after the regular columns so that sort
        # order stays contiguous.
        all_columns = row["StorageDescriptor"]["Columns"] + row.get("PartitionKeys", [])
        for i, column in enumerate(all_columns):
            columns.append(
                ColumnMetadata(
                    column["Name"],
                    column.get("Comment"),
                    column["Type"],
                    i,
                ))

        catalog, schema, table = self._parse_location(
            location=row["StorageDescriptor"]["Location"], name=row["Name"])

        if self._connection_name:
            database = self._connection_name + "/" + row["DatabaseName"]
        else:
            database = row["DatabaseName"]

        yield TableMetadata(
            database,
            catalog,
            schema,
            table,
            row.get("Description") or row.get("Parameters", {}).get("comment"),
            columns,
            row.get("TableType") == "VIRTUAL_VIEW",
        )
def test_table_metadata_extraction_with_single_result(self, mock1, mock2) -> None:
    extractor = PrestoLoopExtractor()
    conf = self.conf.copy()
    conf.put("is_table_metadata_enabled", True)
    extractor.init(conf)
    extractor.execute = MagicMock(
        side_effect=presto_engine_execute_side_effect)
    results = extractor.extract()

    is_partition_column = MOCK_COLUMN_RESULT[2] == "partition key"
    expected = TableMetadata(
        database=extractor._database,
        cluster=None,
        schema=MOCK_SCHEMA_NAME,
        name=MOCK_TABLE_NAME,
        columns=[
            ColumnMetadata(
                name=MOCK_COLUMN_RESULT[0],
                description=MOCK_COLUMN_RESULT[3],
                col_type=MOCK_COLUMN_RESULT[1],
                sort_order=0,
                is_partition_column=is_partition_column,
            )
        ],
    )
    self.assertEqual(results.__repr__(), expected.__repr__())
def test_transformed_record_contains_components(self):
    """Verify the transformed markdown blob contains every record component."""
    column = ColumnMetadata(
        name=COLUMN,
        col_type="Integer",
        sort_order=0,
        description=COLUMN_DESCRIPTION,
    )
    record = TableMetadata(
        database=DATABASE,
        cluster=CLUSTER,
        schema=SCHEMA,
        name=TABLE,
        columns=[column],
    )
    components = [
        DATABASE,
        CLUSTER,
        SCHEMA,
        TABLE,
        COLUMN,
        COLUMN_DESCRIPTION,
    ]

    transformer = MarkdownTransformer()
    transformer.init(self._conf)
    transformed_record = transformer.transform(record)
    markdown_blob = transformed_record.markdown_blob
    transformer.close()

    has_components = all(x in markdown_blob for x in components)
    self.assertTrue(has_components)
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Using itertools.groupby and raw level iterator, it groups to table
    and yields TableMetadata
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            column_description = (
                unidecode(row['col_description'])
                if row['col_description'] else None
            )
            last_row = row
            columns.append(
                ColumnMetadata(name=row['col_name'],
                               description=column_description,
                               col_type=row['col_type'],
                               sort_order=row['col_sort_order']))

        description = (
            unidecode(last_row['description'])
            if last_row['description'] else None
        )

        yield TableMetadata(database=self._database,
                            cluster=last_row['cluster'],
                            schema=last_row['schema'],
                            name=last_row['name'],
                            description=description,
                            columns=columns,
                            is_view=last_row['is_view'] == 'true')
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Using itertools.groupby and raw level iterator, it groups to table
    and yields TableMetadata
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["data_type"],
                    row["col_sort_order"],
                )
            )

        yield TableMetadata(
            self._database,
            last_row["cluster"],
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
        )
def _get_extract_iter(self) -> Iterator[TableMetadata]:
    """
    Using itertools.groupby and raw level iterator, it groups to table
    and yields TableMetadata
    :return:
    """
    for _, group in groupby(self._get_raw_extract_iter(), self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    name=row["column_name"],
                    description=None,
                    col_type=row["column_type"],
                    sort_order=row["column_sort_order"],
                ))

        yield TableMetadata(
            database=self._database,
            cluster=None,
            schema=last_row["schema_name"],
            name=last_row["table_name"],
            description=None,
            columns=columns,
            is_view=last_row["table_type"] == "V",
        )
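# A self-contained sketch (row shape and key function are assumptions) of the
# itertools.groupby pattern the extract iterators above rely on: rows arrive
# ordered by table, so consecutive rows sharing a table key form one group,
# and each group becomes one TableMetadata.
from itertools import groupby

rows = [
    {"table": "schema1.orders", "col_name": "id", "col_sort_order": 0},
    {"table": "schema1.orders", "col_name": "amount", "col_sort_order": 1},
    {"table": "schema1.users", "col_name": "id", "col_sort_order": 0},
]

for table_key, group in groupby(rows, key=lambda row: row["table"]):
    print(table_key, [row["col_name"] for row in group])
# schema1.orders ['id', 'amount']
# schema1.users ['id']

# Note that groupby only merges *adjacent* rows, which is why the extraction
# SQL must ORDER BY the table key; unsorted input would split one table into
# several TableMetadata records.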
def _iterate_over_cols(
    self,
    tags_dict: dict,
    parent: str,
    column: dict,
    cols: List[ColumnMetadata],
    total_cols: int,
) -> int:
    # `column` is a schema field dict, not a string.
    if len(parent) > 0:
        col_name = "{parent}.{field}".format(parent=parent, field=column["name"])
    else:
        col_name = column["name"]

    tags = None
    if tags_dict and "tags" in tags_dict:
        for tag in tags_dict["tags"]:
            if "column" in tag and tag["column"] == col_name:
                tags = tag

    if column["type"] == "RECORD":
        col = ColumnMetadata(
            name=col_name,
            description=column.get("description", ""),
            col_type=column["type"],
            sort_order=total_cols,
            tags=tags,
        )
        cols.append(col)
        total_cols += 1
        for field in column["fields"]:
            total_cols = self._iterate_over_cols(
                tags_dict, col_name, field, cols, total_cols)
        return total_cols
    else:
        col = ColumnMetadata(
            name=col_name,
            description=column.get("description", ""),
            col_type=column["type"],
            sort_order=total_cols,
            tags=tags,
        )
        cols.append(col)
        return total_cols + 1
def get_table_metadata(
    self,
    schema: str,
    table: str,
    cluster: Optional[str] = None,
    is_view_query_enabled: Optional[bool] = False,
) -> TableMetadata:
    # Format table and schema addresses for queries.
    full_schema_address = self._get_full_schema_address(cluster, schema)
    full_table_address = "{}.{}".format(full_schema_address, table)

    # Execute query that gets column type + partition information.
    columns_query = "show columns in {}".format(full_table_address)
    column_query_results = self.execute(columns_query, has_header=True)
    column_query_field_names = next(column_query_results)
    columns = []
    for i, column_query_result in enumerate(column_query_results):
        column_dict = dict(zip(column_query_field_names, column_query_result))
        columns.append(
            ColumnMetadata(
                name=column_dict["Column"],
                description=column_dict["Comment"],
                col_type=column_dict["Type"],
                sort_order=i,
                is_partition_column=column_dict["Extra"] == "partition key",
            )
        )

    if is_view_query_enabled:
        # Execute query that returns if table is a view.
        view_query = """
            select table_type
            from information_schema.tables
            where table_schema='{table_schema}'
            and table_name='{table_name}'
        """.format(table_schema=schema, table_name=table)
        view_query_results = self.execute(view_query, has_header=False)
        is_view = next(view_query_results)[0] == "VIEW"
    else:
        is_view = False

    return TableMetadata(
        database=self._database,
        cluster=cluster,
        schema=schema,
        name=table,
        description=None,
        columns=columns,
        is_view=is_view,
    )
def _get_extract_iter(self):
    with self.driver.session() as session:
        if not hasattr(self, "results"):
            self.results = session.read_transaction(self._execute_query)
        for result in self.results:
            # Parse watermark information.
            partition_columns = []
            for watermark in result["watermarks"]:
                partition_columns.append(watermark["partition_key"])

            # Parse column information.
            column_names = result["column_names"]
            column_descriptions = result["column_descriptions"]
            column_types = result["column_types"]
            column_sort_orders = result["column_sort_orders"]
            zipped_columns = zip_longest(
                column_names, column_descriptions, column_types, column_sort_orders)

            column_metadatas = []
            for (
                column_name,
                column_description,
                column_type,
                column_sort_order,
            ) in zipped_columns:
                is_partition_column = column_name in partition_columns
                column_metadatas.append(
                    ColumnMetadata(
                        name=column_name,
                        description=column_description,
                        col_type=column_type,
                        sort_order=column_sort_order,
                        is_partition_column=is_partition_column,
                    ))

            yield TableMetadata(
                database=result["database"],
                cluster=result["cluster"],
                schema=result["schema"],
                name=result["name"],
                description=result["description"],
                columns=column_metadatas,
                is_view=result["is_view"],
                tags=result["tags"],
            )
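# A small sketch (inputs assumed) of the zip_longest behavior the extractor
# above depends on: if the description list is shorter than the other column
# lists, missing entries are padded with None rather than truncating the
# columns, as plain zip() would.
from itertools import zip_longest

names = ["id", "amount", "ds"]
descriptions = ["primary key"]  # only the first column is documented
types = ["bigint", "double", "varchar"]
sort_orders = [0, 1, 2]

for name, description, col_type, sort_order in zip_longest(
        names, descriptions, types, sort_orders):
    print(name, description, col_type, sort_order)
# id primary key bigint 0
# amount None double 1
# ds None varchar 2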
def test_extraction_with_single_result(self):
    with patch.object(splice_machine_metadata_extractor,
                      "splice_connect") as mock_connect:
        column = ColumnMetadata("column1", None, "int", 0)
        table = TableMetadata(
            self.DATABASE,
            self.CLUSTER,
            "test_schema",
            "test_table",
            None,
            [column],
        )

        # Connection returns a cursor.
        mock_cursor = MagicMock()
        mock_execute = MagicMock()
        mock_fetchall = MagicMock()

        # self.connection = splice_connect(...)
        mock_connection = MagicMock()
        mock_connect.return_value = mock_connection

        # self.cursor = self.connection.cursor()
        mock_connection.cursor.return_value = mock_cursor

        # self.cursor.execute(...)
        mock_cursor.execute = mock_execute

        # for row in self.cursor.fetchall()
        mock_cursor.fetchall = mock_fetchall
        mock_fetchall.return_value = [[
            table.schema,
            table.name,
            "not-a-view",
            column.name,
            column.sort_order,
            column.type,
        ]]

        extractor = self.Extractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = table

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def _get_extract_iter(self):
    # type: () -> Iterator[TableMetadata]
    """
    Using itertools.groupby and raw level iterator, it groups to table
    and yields TableMetadata
    :return:
    """
    with self.database.snapshot() as snapshot:
        results = snapshot.execute_sql(self.sql_stmt)
        header = SpannerMetadataExtractor.HEADER
        headered_results = [dict(zip(header, result)) for result in results]

        for _, group in groupby(headered_results, self._get_table_key):
            columns = []
            for row in group:
                last_row = row
                columns.append(
                    ColumnMetadata(
                        row["col_name"],
                        None,
                        row["col_type"],
                        row["col_sort_order"],
                    ))

            schema = "{}.{}".format(self._instance_id, self._database_id)
            yield TableMetadata(
                database=self._connection_name or "spanner",
                cluster=self._project_id,
                schema=schema,
                name=last_row["name"],
                description=None,
                columns=columns,
            )
def get_all_table_metadata_from_information_schema(
    self,
    cluster: Optional[str] = None,
    where_clause_suffix: str = "",
):
    unformatted_query = """
    SELECT
      a.table_catalog AS cluster
    , a.table_schema AS schema
    , a.table_name AS name
    , NULL AS description
    , a.column_name AS col_name
    , a.ordinal_position AS col_sort_order
    , IF(a.extra_info = 'partition key', 1, 0) AS is_partition_col
    , a.comment AS col_description
    , a.data_type AS col_type
    , IF(b.table_name IS NOT NULL, 1, 0) AS is_view
    FROM {cluster_prefix}information_schema.columns a
    LEFT JOIN {cluster_prefix}information_schema.views b
        ON a.table_catalog = b.table_catalog
        AND a.table_schema = b.table_schema
        AND a.table_name = b.table_name
    {where_clause_suffix}
    """

    LOGGER.info(
        "Pulling all table metadata in bulk from "
        "information_schema in cluster name: {}".format(cluster)
    )

    if cluster is not None:
        cluster_prefix = cluster + "."
    else:
        cluster_prefix = ""

    formatted_query = unformatted_query.format(
        cluster_prefix=cluster_prefix, where_clause_suffix=where_clause_suffix
    )
    LOGGER.info("SQL for presto: {}".format(formatted_query))

    query_results = self.execute(formatted_query, is_dict_return_enabled=True)
    for _, group in groupby(query_results, self._get_table_key):
        columns = []
        for row in group:
            last_row = row
            columns.append(
                ColumnMetadata(
                    row["col_name"],
                    row["col_description"],
                    row["col_type"],
                    row["col_sort_order"],
                )
            )
        yield TableMetadata(
            self._database,
            cluster or self._default_cluster_name,
            last_row["schema"],
            last_row["name"],
            last_row["description"],
            columns,
            is_view=bool(last_row["is_view"]),
        )
def test_extraction_with_single_result(self):
    # type: () -> None
    with patch.object(SQLAlchemyExtractor, "_get_connection") as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute
        table = {
            "schema": "test_schema",
            "name": "test_table",
            "description": "a table for testing",
            "cluster": self.conf[SnowflakeMetadataExtractor.CLUSTER_KEY],
            "is_view": "false",
        }

        sql_execute.return_value = [
            self._union(
                {"col_name": "col_id1",
                 "col_type": "number",
                 "col_description": "description of id1",
                 "col_sort_order": 0}, table),
            self._union(
                {"col_name": "col_id2",
                 "col_type": "number",
                 "col_description": "description of id2",
                 "col_sort_order": 1}, table),
            self._union(
                {"col_name": "is_active",
                 "col_type": "boolean",
                 "col_description": None,
                 "col_sort_order": 2}, table),
            self._union(
                {"col_name": "source",
                 "col_type": "varchar",
                 "col_description": "description of source",
                 "col_sort_order": 3}, table),
            self._union(
                {"col_name": "etl_created_at",
                 "col_type": "timestamp_ltz",
                 "col_description": "description of etl_created_at",
                 "col_sort_order": 4}, table),
            self._union(
                {"col_name": "ds",
                 "col_type": "varchar",
                 "col_description": None,
                 "col_sort_order": 5}, table),
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "prod",
            "MY_CLUSTER",
            "test_schema",
            "test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "number", 0),
                ColumnMetadata("col_id2", "description of id2", "number", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata("etl_created_at",
                               "description of etl_created_at",
                               "timestamp_ltz", 4),
                ColumnMetadata("ds", None, "varchar", 5),
            ],
        )

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_single_result(self) -> None:
    with patch.object(GlueExtractor, "_search_tables") as mock_search:
        mock_search.return_value = [{
            "Name": "test_catalog_test_schema_test_table",
            "DatabaseName": "test_database",
            "Description": "a table for testing",
            "StorageDescriptor": {
                "Columns": [
                    {
                        "Name": "col_id1",
                        "Type": "bigint",
                        "Comment": "description of id1",
                    },
                    {
                        "Name": "col_id2",
                        "Type": "bigint",
                        "Comment": "description of id2",
                    },
                    {"Name": "is_active", "Type": "boolean"},
                    {
                        "Name": "source",
                        "Type": "varchar",
                        "Comment": "description of source",
                    },
                    {
                        "Name": "etl_created_at",
                        "Type": "timestamp",
                        "Comment": "description of etl_created_at",
                    },
                    {"Name": "ds", "Type": "varchar"},
                ],
                "Location": "test_catalog.test_schema.test_table",
            },
            "PartitionKeys": [
                {
                    "Name": "partition_key1",
                    "Type": "string",
                    "Comment": "description of partition_key1",
                },
            ],
            "TableType": "EXTERNAL_TABLE",
        }]

        extractor = GlueExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            "test_database",
            None,
            None,
            "test_catalog_test_schema_test_table",
            "a table for testing",
            [
                ColumnMetadata("col_id1", "description of id1", "bigint", 0),
                ColumnMetadata("col_id2", "description of id2", "bigint", 1),
                ColumnMetadata("is_active", None, "boolean", 2),
                ColumnMetadata("source", "description of source", "varchar", 3),
                ColumnMetadata("etl_created_at",
                               "description of etl_created_at",
                               "timestamp", 4),
                ColumnMetadata("ds", None, "varchar", 5),
                ColumnMetadata("partition_key1",
                               "description of partition_key1",
                               "string", 6),
            ],
            False,
        )

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())