def test_sql_statement(self) -> None:
    """Verify the database key is not interpolated into the rendered SQL statement."""
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)

        rendered_sql = extractor.sql_stmt
        self.assertFalse(self.database_key in rendered_sql)
def test_sql_statement(self) -> None:
    """Test that the configured where-clause suffix is embedded in the SQL statement."""
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        self.assertTrue(self.where_clause_suffix in extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """Confirm the extractor's default cluster name shows up in the generated SQL."""
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)

        default_cluster = SnowflakeMetadataExtractor.DEFAULT_CLUSTER_NAME
        self.assertTrue(default_cluster in extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """Test that the SQL selects 'table_catalog' and does not embed the cluster key."""
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        self.assertTrue('table_catalog' in extractor.sql_stmt)
        self.assertFalse(self.cluster_key in extractor.sql_stmt)
def test_extraction_with_empty_query_result(self) -> None:
    """ Test Extraction with empty result from query """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)

        results = extractor.extract()
        # assertIsNone gives a clearer failure message than assertEqual(results, None)
        self.assertIsNone(results)
def create_snowflake_table_metadata_job():
    """
    Launches databuilder job that extracts table and column metadata from Snowflake
    database and publishes to Neo4j.
    """
    # Restrict extraction to the supported schemas and drop internal dw_ columns.
    where_clause_suffix = textwrap.dedent("""
        WHERE c.TABLE_SCHEMA IN {schemas}
        AND lower(c.COLUMN_NAME) not like 'dw_%';
    """).format(schemas=SUPPORTED_SCHEMA_SQL_IN_CLAUSE)

    tmp_folder = '/var/tmp/amundsen/table_metadata'
    node_files_folder = f'{tmp_folder}/nodes/'
    relationship_files_folder = f'{tmp_folder}/relationships/'

    # Config is assembled per component (extractor / loader / publisher) and merged.
    extractor_conf = {
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause_suffix,
    }
    loader_conf = {
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
    }
    publisher_conf = {
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'some_unique_tag'  # TO-DO unique tag must be added
    }
    job_config = ConfigFactory.from_dict({**extractor_conf, **loader_conf, **publisher_conf})

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(), loader=FsNeo4jCSVLoader())
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    job.launch()
def _create_snowflake_extractor(
    source: CatSource,
) -> Tuple[SnowflakeMetadataExtractor, Any]:
    """
    Build a Snowflake metadata extractor together with the config it
    should be initialised with, derived from the given catalog source.
    """
    extractor = SnowflakeMetadataExtractor()
    scope = extractor.get_scope()

    # The connection string lives under the nested SQLAlchemy extractor scope.
    sqlalchemy_scope = SQLAlchemyExtractor().get_scope()
    conn_string_key = f"{scope}.{sqlalchemy_scope}.{SQLAlchemyExtractor.CONN_STRING}"

    conf = ConfigFactory.from_dict(
        {
            conn_string_key: source.conn_string,
            f"{scope}.{SnowflakeMetadataExtractor.CLUSTER_KEY}": source.cluster,
            f"{scope}.{SnowflakeMetadataExtractor.DATABASE_KEY}": source.database,
            f"{scope}.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}": source.database,
            # f"{scope}.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}": connection.where_clause_suffix,
        }
    )
    return extractor, conf
def test_extraction_with_database_specified(self) -> None:
    """Test that a single row is extracted as TableMetadata keyed on the configured database,
    and that a subsequent extract() returns None."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        # One row: table-level fields plus one column's fields.
        sql_execute.return_value = [{
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': 'MY_CLUSTER',
            'is_view': 'false',
            'col_name': 'ds',
            'col_type': 'varchar',
            'col_description': None,
            'col_sort_order': 0
        }]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            self.database_key, 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
            [ColumnMetadata('ds', None, 'varchar', 0)])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def create_sample_snowflake_job():
    """
    Build a databuilder job that extracts Snowflake table/column metadata
    and publishes it to Neo4j.

    :return: a configured DefaultJob, ready for launch()
    """
    # Skip ignored schemas, staging/history/snapshot schemas, and internal dw_ columns.
    where_clause = "WHERE c.TABLE_SCHEMA not in ({0}) \
        AND c.TABLE_SCHEMA not like 'STAGE_%' \
        AND c.TABLE_SCHEMA not like 'HIST_%' \
        AND c.TABLE_SCHEMA not like 'SNAP_%' \
        AND lower(c.COLUMN_NAME) not like 'dw_%';".format(','.join(IGNORED_SCHEMAS))

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    sql_extractor = SnowflakeMetadataExtractor()
    csv_loader = FsNeo4jCSVLoader()
    task = DefaultTask(extractor=sql_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': SNOWFLAKE_CONN_STRING,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.DATABASE_KEY}': 'YourSnowflakeDbName',
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
def create_sample_snowflake_job():
    """
    Assemble a databuilder job that pulls Snowflake table and column
    metadata and publishes it to Neo4j as CSV files.
    """
    # Exclude ignored, staging, history, and snapshot schemas plus internal dw_ columns.
    where_clause = f"WHERE c.TABLE_SCHEMA not in ({','.join(IGNORED_SCHEMAS)}) \
        AND c.TABLE_SCHEMA not like 'STAGE_%' \
        AND c.TABLE_SCHEMA not like 'HIST_%' \
        AND c.TABLE_SCHEMA not like 'SNAP_%' \
        AND lower(c.COLUMN_NAME) not like 'dw_%';"

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = f'{tmp_folder}/nodes'
    relationship_files_folder = f'{tmp_folder}/relationships'

    # Per-component config merged into a single dict for ConfigFactory.
    extractor_conf = {
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': connection_string(),
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': SNOWFLAKE_DATABASE_KEY,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
    }
    loader_conf = {
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
    }
    publisher_conf = {
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j_endpoint,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j_user,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j_password,
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    }
    job_config = ConfigFactory.from_dict({**extractor_conf, **loader_conf, **publisher_conf})

    metadata_task = DefaultTask(extractor=SnowflakeMetadataExtractor(),
                                loader=FsNeo4jCSVLoader())
    return DefaultJob(conf=job_config, task=metadata_task, publisher=Neo4jCsvPublisher())
def create_snowflake_metadata_job(*, database, ignore_schemas, conn_string, host, neo4j, **kwargs):
    """
    Build a databuilder job that extracts Snowflake metadata for *database*
    (excluding *ignore_schemas*) and publishes it to the given Neo4j instance.

    :param database: Snowflake database name to extract from
    :param ignore_schemas: iterable of schema names to exclude
    :param conn_string: SQLAlchemy connection string for Snowflake
    :param host: dict with 'node_files_folder' and 'relationship_files_folder'
    :param neo4j: dict with 'endpoint', 'user', and 'password'
    :return: a configured DefaultJob
    """
    node_files_folder = host["node_files_folder"]
    relationship_files_folder = host["relationship_files_folder"]

    # BUG FIX: the previous version quoted the whole joined list as one SQL
    # literal -- not in ('A, B') -- so no schema was actually excluded.
    # Quote each schema individually to produce not in ('A', 'B').
    quoted_schemas = ", ".join(f"'{schema}'" for schema in ignore_schemas)
    where_clause = f"WHERE c.TABLE_SCHEMA not in ({quoted_schemas})"

    task = DefaultTask(extractor=SnowflakeMetadataExtractor(), loader=FsNeo4jCSVLoader())
    job_config = ConfigFactory.from_dict({
        f'extractor.snowflake.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': conn_string,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.SNOWFLAKE_DATABASE_KEY}': database,
        f'extractor.snowflake.{SnowflakeMetadataExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })
    job = DefaultJob(conf=job_config, task=task, publisher=Neo4jCsvPublisher())
    return job
def test_extraction_with_single_result(self) -> None:
    """Test that several column rows for one table are grouped into a single
    TableMetadata record, and that extract() then returns None."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        # Table-level fields shared by every column row below.
        table = {
            'schema': 'test_schema',
            'name': 'test_table',
            'description': 'a table for testing',
            'cluster': self.conf['extractor.snowflake_metadata.{}'.format(
                SnowflakeMetadataExtractor.CLUSTER_KEY)],
            'is_view': 'false'
        }

        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1',
                 'col_type': 'number',
                 'col_description': 'description of id1',
                 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2',
                 'col_type': 'number',
                 'col_description': 'description of id2',
                 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active',
                 'col_type': 'boolean',
                 'col_description': None,
                 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source',
                 'col_type': 'varchar',
                 'col_description': 'description of source',
                 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at',
                 'col_type': 'timestamp_ltz',
                 'col_description': 'description of etl_created_at',
                 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds',
                 'col_type': 'varchar',
                 'col_description': None,
                 'col_sort_order': 5}, table)
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)
        actual = extractor.extract()
        expected = TableMetadata(
            'snowflake', 'MY_CLUSTER', 'test_schema', 'test_table', 'a table for testing',
            [
                ColumnMetadata('col_id1', 'description of id1', 'number', 0),
                ColumnMetadata('col_id2', 'description of id2', 'number', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp_ltz', 4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])

        self.assertEqual(expected.__repr__(), actual.__repr__())
        self.assertIsNone(extractor.extract())
def test_extraction_with_multiple_result(self) -> None:
    """Test that rows for several tables are grouped into one TableMetadata per
    table, emitted in row order, and that extract() returns None once exhausted."""
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        connection = MagicMock()
        mock_connection.return_value = connection
        sql_execute = MagicMock()
        connection.execute = sql_execute

        # Three tables across two schemas. Note 'is_view' values: 'nottrue' and
        # 'false' for the first two (treated as non-view), 'true' for the third.
        table = {
            'schema': 'test_schema1',
            'name': 'test_table1',
            'description': 'test table 1',
            'cluster': self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'is_view': 'nottrue'
        }

        table1 = {
            'schema': 'test_schema1',
            'name': 'test_table2',
            'description': 'test table 2',
            'cluster': self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'is_view': 'false'
        }

        table2 = {
            'schema': 'test_schema2',
            'name': 'test_table3',
            'description': 'test table 3',
            'cluster': self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'is_view': 'true'
        }

        # Rows are ordered per table; the extractor is expected to group
        # consecutive rows sharing the same table-level fields.
        sql_execute.return_value = [
            self._union(
                {'col_name': 'col_id1',
                 'col_type': 'number',
                 'col_description': 'description of col_id1',
                 'col_sort_order': 0}, table),
            self._union(
                {'col_name': 'col_id2',
                 'col_type': 'number',
                 'col_description': 'description of col_id2',
                 'col_sort_order': 1}, table),
            self._union(
                {'col_name': 'is_active',
                 'col_type': 'boolean',
                 'col_description': None,
                 'col_sort_order': 2}, table),
            self._union(
                {'col_name': 'source',
                 'col_type': 'varchar',
                 'col_description': 'description of source',
                 'col_sort_order': 3}, table),
            self._union(
                {'col_name': 'etl_created_at',
                 'col_type': 'timestamp_ltz',
                 'col_description': 'description of etl_created_at',
                 'col_sort_order': 4}, table),
            self._union(
                {'col_name': 'ds',
                 'col_type': 'varchar',
                 'col_description': None,
                 'col_sort_order': 5}, table),
            self._union(
                {'col_name': 'col_name',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name',
                 'col_sort_order': 0}, table1),
            self._union(
                {'col_name': 'col_name2',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name2',
                 'col_sort_order': 1}, table1),
            self._union(
                {'col_name': 'col_id3',
                 'col_type': 'varchar',
                 'col_description': 'description of col_id3',
                 'col_sort_order': 0}, table2),
            self._union(
                {'col_name': 'col_name3',
                 'col_type': 'varchar',
                 'col_description': 'description of col_name3',
                 'col_sort_order': 1}, table2)
        ]

        extractor = SnowflakeMetadataExtractor()
        extractor.init(self.conf)

        # First table: six columns, not a view.
        expected = TableMetadata(
            'snowflake',
            self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'test_schema1', 'test_table1', 'test table 1',
            [
                ColumnMetadata('col_id1', 'description of col_id1', 'number', 0),
                ColumnMetadata('col_id2', 'description of col_id2', 'number', 1),
                ColumnMetadata('is_active', None, 'boolean', 2),
                ColumnMetadata('source', 'description of source', 'varchar', 3),
                ColumnMetadata('etl_created_at', 'description of etl_created_at', 'timestamp_ltz', 4),
                ColumnMetadata('ds', None, 'varchar', 5)
            ])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        # Second table: two columns, same schema as the first.
        expected = TableMetadata(
            'snowflake',
            self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'test_schema1', 'test_table2', 'test table 2',
            [
                ColumnMetadata('col_name', 'description of col_name', 'varchar', 0),
                ColumnMetadata('col_name2', 'description of col_name2', 'varchar', 1)
            ])
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        # Third table: different schema, flagged as a view (trailing True).
        expected = TableMetadata(
            'snowflake',
            self.conf[
                f'extractor.snowflake_metadata.{SnowflakeMetadataExtractor.CLUSTER_KEY}'],
            'test_schema2', 'test_table3', 'test table 3',
            [
                ColumnMetadata('col_id3', 'description of col_id3', 'varchar', 0),
                ColumnMetadata('col_name3', 'description of col_name3', 'varchar', 1)
            ], True)
        self.assertEqual(expected.__repr__(), extractor.extract().__repr__())

        # Exhausted iterator keeps returning None on repeated calls.
        self.assertIsNone(extractor.extract())
        self.assertIsNone(extractor.extract())