def test_sql_statement(self) -> None:
    """
    Check that the database key is absent from the extractor's SQL statement.

    The extractor is initialised with ``self.conf`` and ``self.database_key``
    must not occur anywhere in ``extractor.sql_stmt``.
    """
    # _get_connection is patched out; presumably this keeps init() from
    # opening a real database connection -- confirm against SQLAlchemyExtractor.
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        # assertNotIn reports both operands on failure, unlike
        # assertFalse(x in y) which only reports "True is not false".
        self.assertNotIn(self.database_key, extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """
    Check that the cluster key appears in the extractor's SQL statement.

    The extractor is initialised with ``self.conf`` and ``self.cluster_key``
    must occur in ``extractor.sql_stmt``.
    """
    # _get_connection is patched out; presumably this keeps init() from
    # opening a real database connection -- confirm against SQLAlchemyExtractor.
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(self.cluster_key, extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """
    Check that the where clause suffix appears in the extractor's SQL statement.

    The extractor is initialised with ``self.conf`` and
    ``self.where_clause_suffix`` must occur in ``extractor.sql_stmt``.
    """
    # _get_connection is patched out; presumably this keeps init() from
    # opening a real database connection -- confirm against SQLAlchemyExtractor.
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        # assertIn reports both operands on failure, unlike assertTrue(x in y).
        self.assertIn(self.where_clause_suffix, extractor.sql_stmt)
def test_sql_statement(self) -> None:
    """
    Ensure the catalog is used as cluster in the extractor's SQL statement.

    After init with ``self.conf``, ``extractor.sql_stmt`` must contain the
    literal ``table_catalog`` and must not contain ``self.cluster_key``.
    """
    # _get_connection is patched out; presumably this keeps init() from
    # opening a real database connection -- confirm against SQLAlchemyExtractor.
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)
        # assertIn / assertNotIn report both operands on failure, unlike
        # assertTrue(x in y) / assertFalse(x in y).
        self.assertIn('table_catalog', extractor.sql_stmt)
        self.assertNotIn(self.cluster_key, extractor.sql_stmt)
def test_extraction_with_empty_query_result(self) -> None:
    """
    Extraction yields None when the query produces no rows.

    No return value is configured on the patched connection, so the
    extractor's first ``extract()`` call is expected to return ``None``.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection'):
        last_updated_extractor = SnowflakeTableLastUpdatedExtractor()
        last_updated_extractor.init(self.conf)
        self.assertIsNone(last_updated_extractor.extract())
def test_extraction_with_single_result(self) -> None:
    """
    Extraction with the configured cluster and a single table row.

    The mocked connection's ``execute`` returns one row; the first
    ``extract()`` must match a ``TableLastUpdated`` with db ``snowflake`` and
    cluster ``MY_CLUSTER`` (compared by repr), and the second ``extract()``
    must return ``None``.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        cluster_conf_key = 'extractor.snowflake_table_last_updated.{}'.format(
            SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)
        query_rows = [{
            'schema': 'test_schema',
            'table_name': 'test_table',
            'last_updated_time': 1000,
            'cluster': self.conf[cluster_conf_key],
        }]

        # Wire a fake connection whose execute() hands back the canned rows.
        fake_connection = MagicMock()
        fake_connection.execute = MagicMock(return_value=query_rows)
        mock_connection.return_value = fake_connection

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)

        first_record = extractor.extract()
        expected_record = TableLastUpdated(
            schema='test_schema',
            table_name='test_table',
            last_updated_time_epoch=1000,
            db='snowflake',
            cluster='MY_CLUSTER',
        )
        self.assertEqual(repr(expected_record), repr(first_record))

        # The single row has been consumed, so nothing is left to extract.
        self.assertIsNone(extractor.extract())
def test_extraction_with_database_specified(self) -> None:
    """
    Extraction result carries the database key from the test fixture.

    The mocked connection's ``execute`` returns one row; the first
    ``extract()`` must match a ``TableLastUpdated`` whose ``db`` is
    ``self.database_key`` (compared by repr), and the second ``extract()``
    must return ``None``.
    """
    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        query_rows = [{
            'schema': 'test_schema',
            'table_name': 'test_table',
            'last_updated_time': 1000,
            'cluster': 'MY_CLUSTER',
        }]

        # Wire a fake connection whose execute() hands back the canned rows.
        fake_connection = MagicMock()
        fake_connection.execute = MagicMock(return_value=query_rows)
        mock_connection.return_value = fake_connection

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)

        first_record = extractor.extract()
        expected_record = TableLastUpdated(
            schema='test_schema',
            table_name='test_table',
            last_updated_time_epoch=1000,
            db=self.database_key,
            cluster='MY_CLUSTER',
        )
        self.assertEqual(repr(expected_record), repr(first_record))

        # The single row has been consumed, so nothing is left to extract.
        self.assertIsNone(extractor.extract())
def create_sample_snowflake_last_updated_job():
    """
    Build the sample job for Snowflake table last-updated information.

    Wires a ``SnowflakeTableLastUpdatedExtractor`` and an ``FsNeo4jCSVLoader``
    into a ``DefaultTask`` and wraps that, together with a
    ``Neo4jCsvPublisher``, into a ``DefaultJob``. The job is only constructed
    here; it is not launched.

    Reads the module-level names ``SNOWFLAKE_DATABASE_KEY``,
    ``connection_string``, ``neo4j_endpoint``, ``neo4j_user`` and
    ``neo4j_password``.

    :return: the configured ``DefaultJob``
    """
    # Only tables with a known last-altered timestamp are of interest.
    # (An earlier column-oriented clause assigned to the same name was dead
    # code -- it was overwritten before use -- and has been removed.)
    where_clause = ' WHERE t.last_altered IS NOT NULL '

    tmp_folder = '/var/tmp/amundsen/tables'
    node_files_folder = '{tmp_folder}/nodes'.format(tmp_folder=tmp_folder)
    relationship_files_folder = '{tmp_folder}/relationships'.format(tmp_folder=tmp_folder)

    sql_extractor = SnowflakeTableLastUpdatedExtractor()
    csv_loader = FsNeo4jCSVLoader()

    task = DefaultTask(extractor=sql_extractor, loader=csv_loader)

    job_config = ConfigFactory.from_dict({
        # Extractor settings.
        'extractor.snowflake_table_last_updated.{}'.format(
            SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY): SNOWFLAKE_DATABASE_KEY,
        'extractor.snowflake_table_last_updated.{}'.format(
            SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY): where_clause,
        'extractor.snowflake_table_last_updated.{}'.format(
            SnowflakeTableLastUpdatedExtractor.USE_CATALOG_AS_CLUSTER_NAME): True,
        'extractor.snowflake_table_last_updated.extractor.sqlalchemy.{}'.format(
            SQLAlchemyExtractor.CONN_STRING): connection_string(),
        # Loader settings: the loader and the publisher share the same folders.
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.NODE_DIR_PATH): node_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.RELATION_DIR_PATH): relationship_files_folder,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR): True,
        'loader.filesystem_csv_neo4j.{}'.format(FsNeo4jCSVLoader.FORCE_CREATE_DIR): True,
        # Publisher settings.
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NODE_FILES_DIR): node_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.RELATION_FILES_DIR): relationship_files_folder,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_END_POINT_KEY): neo4j_endpoint,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_USER): neo4j_user,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.NEO4J_PASSWORD): neo4j_password,
        'publisher.neo4j.{}'.format(neo4j_csv_publisher.JOB_PUBLISH_TAG): 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def create_snowflake_last_update_job(*, database, ignore_schemas, conn_string, host, neo4j, **kwargs):
    """
    Build a job for Snowflake table last-updated information.

    Wires a ``SnowflakeTableLastUpdatedExtractor`` and an ``FsNeo4jCSVLoader``
    into a ``DefaultTask`` and wraps that, together with a
    ``Neo4jCsvPublisher``, into a ``DefaultJob``. The job is only constructed
    here; it is not launched.

    :param database: value stored under the extractor's SNOWFLAKE_DATABASE_KEY
    :param ignore_schemas: iterable of schema names to exclude through the
        WHERE clause suffix; names are embedded verbatim in the SQL, so they
        are assumed to come from trusted configuration
    :param conn_string: SQLAlchemy connection string for the extractor
    :param host: mapping providing ``node_files_folder`` and
        ``relationship_files_folder``
    :param neo4j: mapping providing ``endpoint``, ``user`` and ``password``
    :param kwargs: accepted and ignored
    :return: the configured ``DefaultJob``
    """
    node_files_folder = host["node_files_folder"]
    relationship_files_folder = host["relationship_files_folder"]

    # Quote every schema name on its own so that the IN list has one literal
    # per schema: ('A', 'B'). Quoting the joined string as a whole would give
    # ('A, B'), a single literal that matches no real schema. An empty input
    # falls back to a single empty-string literal, keeping the SQL valid.
    quoted_schemas = ', '.join(f"'{schema}'" for schema in ignore_schemas) or "''"
    where_clause = f"WHERE t.TABLE_SCHEMA not in ({quoted_schemas})"

    task = DefaultTask(extractor=SnowflakeTableLastUpdatedExtractor(), loader=FsNeo4jCSVLoader())

    job_config = ConfigFactory.from_dict({
        # Extractor settings.
        f'extractor.snowflake_table_last_updated.extractor.sqlalchemy.{SQLAlchemyExtractor.CONN_STRING}': conn_string,
        f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.SNOWFLAKE_DATABASE_KEY}': database,
        f'extractor.snowflake_table_last_updated.{SnowflakeTableLastUpdatedExtractor.WHERE_CLAUSE_SUFFIX_KEY}': where_clause,
        # Loader settings: the loader and the publisher share the same folders.
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.NODE_DIR_PATH}': node_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.RELATION_DIR_PATH}': relationship_files_folder,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.SHOULD_DELETE_CREATED_DIR}': True,
        f'loader.filesystem_csv_neo4j.{FsNeo4jCSVLoader.FORCE_CREATE_DIR}': True,
        # Publisher settings.
        f'publisher.neo4j.{neo4j_csv_publisher.NODE_FILES_DIR}': node_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.RELATION_FILES_DIR}': relationship_files_folder,
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_END_POINT_KEY}': neo4j["endpoint"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_USER}': neo4j["user"],
        f'publisher.neo4j.{neo4j_csv_publisher.NEO4J_PASSWORD}': neo4j["password"],
        f'publisher.neo4j.{neo4j_csv_publisher.JOB_PUBLISH_TAG}': 'unique_tag'
    })

    job = DefaultJob(conf=job_config,
                     task=task,
                     publisher=Neo4jCsvPublisher())
    return job
def test_extraction_with_multiple_result(self) -> None:
    """
    Extraction with the configured cluster and several table rows.

    The mocked connection's ``execute`` returns three rows; successive
    ``extract()`` calls must match, in order and compared by repr, the
    corresponding ``TableLastUpdated`` records with db ``snowflake`` and
    cluster ``MY_CLUSTER``. A fourth ``extract()`` must return ``None``.
    """
    # (schema, table name, last updated epoch) for each row, in query order.
    fixtures = [
        ('test_schema1', 'test_table1', 1000),
        ('test_schema1', 'test_table2', 2000),
        ('test_schema2', 'test_table3', 3000),
    ]

    with patch.object(SQLAlchemyExtractor, '_get_connection') as mock_connection:
        default_cluster = self.conf[
            'extractor.snowflake_table_last_updated.{}'.format(
                SnowflakeTableLastUpdatedExtractor.CLUSTER_KEY)]
        query_rows = [
            {
                'schema': schema,
                'table_name': table_name,
                'last_updated_time': epoch,
                'cluster': default_cluster,
            }
            for schema, table_name, epoch in fixtures
        ]

        # Wire a fake connection whose execute() hands back the canned rows.
        fake_connection = MagicMock()
        fake_connection.execute = MagicMock(return_value=query_rows)
        mock_connection.return_value = fake_connection

        extractor = SnowflakeTableLastUpdatedExtractor()
        extractor.init(self.conf)

        for schema, table_name, epoch in fixtures:
            expected = TableLastUpdated(
                schema=schema,
                table_name=table_name,
                last_updated_time_epoch=epoch,
                db='snowflake',
                cluster='MY_CLUSTER',
            )
            self.assertEqual(repr(expected), repr(extractor.extract()))

        # All rows have been consumed, so nothing is left to extract.
        self.assertIsNone(extractor.extract())