def test_view(self):
    """A table of type VIRTUAL_VIEW (Presto view) is listed with its columns."""
    runner = Athena({'glue': True, 'region': 'mars-east-1'})

    # Stubbed catalog entry describing a Presto view with a single column.
    view_table = {
        'Name': 'view',
        'StorageDescriptor': {
            'Columns': [{'Name': 'sk', 'Type': 'int'}],
            'Location': '',
            'Compressed': False,
            'NumberOfBuckets': 0,
            'SerdeInfo': {},
            'SortColumns': [],
            'StoredAsSubDirectories': False,
        },
        'PartitionKeys': [],
        'ViewOriginalText': '/* Presto View: ... */',
        'ViewExpandedText': '/* Presto View */',
        'TableType': 'VIRTUAL_VIEW',
        'Parameters': {'comment': 'Presto View', 'presto_view': 'true'},
    }
    expected = [{'columns': ['sk'], 'name': 'test1.view'}]

    self.stubber.add_response('get_databases', {'DatabaseList': [{'Name': 'test1'}]}, {})
    self.stubber.add_response('get_tables', {'TableList': [view_table]}, {'DatabaseName': 'test1'})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_no_storage_descriptor_table(self):
    """
    Not every Glue table carries a "StorageDescriptor" entry; such a table
    is expected to be left out of the schema listing.
    """
    runner = Athena({'glue': True, 'region': 'mars-east-1'})

    # Table entry deliberately lacking the "StorageDescriptor" key.
    bare_table = {
        'Name': 'no_storage_descriptor_table',
        'PartitionKeys': [],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {'EXTERNAL': 'TRUE'},
    }

    self.stubber.add_response('get_databases', {'DatabaseList': [{'Name': 'test1'}]}, {})
    self.stubber.add_response('get_tables', {'TableList': [bare_table]}, {'DatabaseName': 'test1'})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == []
def test_external_table(self):
    """Unpartitioned table crawled through a JDBC connection."""
    runner = Athena({"glue": True, "region": "mars-east-1"})

    # The same crawler parameters appear on both the storage descriptor and
    # the table itself; each use gets its own copy of the dict.
    crawler_parameters = {
        "CrawlerSchemaDeserializerVersion": "1.0",
        "CrawlerSchemaSerializerVersion": "1.0",
        "UPDATED_BY_CRAWLER": "jdbc",
        "classification": "sqlserver",
        "compressionType": "none",
        "connectionName": "jdbctest",
        "typeOfData": "view",
    }
    storage_descriptor = {
        "Columns": [{"Name": "row_id", "Type": "int"}],
        "Location": "Database.Schema.Table",
        "Compressed": False,
        "NumberOfBuckets": -1,
        "SerdeInfo": {"Parameters": {}},
        "BucketColumns": [],
        "SortColumns": [],
        "Parameters": dict(crawler_parameters),
        "StoredAsSubDirectories": False,
    }
    jdbc_table = {
        "Name": "jdbc_table",
        "StorageDescriptor": storage_descriptor,
        "PartitionKeys": [],
        "TableType": "EXTERNAL_TABLE",
        "Parameters": dict(crawler_parameters),
    }
    expected = [{"columns": ["row_id"], "name": "test1.jdbc_table"}]

    self.stubber.add_response("get_databases", {"DatabaseList": [{"Name": "test1"}]}, {})
    self.stubber.add_response("get_tables", {"TableList": [jdbc_table]}, {"DatabaseName": "test1"})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_dodgy_table_does_not_break_schema_listing(self):
    """
    Not every Glue table contains a "PartitionKeys" entry (possibly an
    Athena Catalog to Glue catalog migration issue); the schema listing
    must still include such a table.
    """
    runner = Athena({"glue": True, "region": "mars-east-1"})

    serde_info = {
        "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
        "Parameters": {
            "field.delim": "|",
            "skip.header.line.count": "1",
        },
    }
    # Note: no "PartitionKeys" key on this table entry.
    csv_table = {
        "Name": "csv",
        "StorageDescriptor": {
            "Columns": [{"Name": "region", "Type": "string"}],
            "Location": "s3://bucket/files/",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "Compressed": False,
            "NumberOfBuckets": 0,
            "SerdeInfo": serde_info,
            "SortColumns": [],
            "StoredAsSubDirectories": False,
        },
        "Parameters": {"classification": "csv"},
    }
    expected = [
        {
            "columns": ["region"],
            "name": "test1.csv",
            "metadata": [{"type": "string", "name": "region"}],
        }
    ]

    self.stubber.add_response("get_databases", {"DatabaseList": [{"Name": "test1"}]}, {})
    self.stubber.add_response("get_tables", {"TableList": [csv_table]}, {"DatabaseName": "test1"})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_external_table(self):
    """Unpartitioned table crawled through a JDBC connection."""
    runner = Athena({'glue': True, 'region': 'mars-east-1'})

    # The same crawler parameters appear on both the storage descriptor and
    # the table itself; each use gets its own copy of the dict.
    crawler_parameters = {
        'CrawlerSchemaDeserializerVersion': '1.0',
        'CrawlerSchemaSerializerVersion': '1.0',
        'UPDATED_BY_CRAWLER': 'jdbc',
        'classification': 'sqlserver',
        'compressionType': 'none',
        'connectionName': 'jdbctest',
        'typeOfData': 'view',
    }
    storage_descriptor = {
        'Columns': [{'Name': 'row_id', 'Type': 'int'}],
        'Location': 'Database.Schema.Table',
        'Compressed': False,
        'NumberOfBuckets': -1,
        'SerdeInfo': {'Parameters': {}},
        'BucketColumns': [],
        'SortColumns': [],
        'Parameters': dict(crawler_parameters),
        'StoredAsSubDirectories': False,
    }
    jdbc_table = {
        'Name': 'jdbc_table',
        'StorageDescriptor': storage_descriptor,
        'PartitionKeys': [],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': dict(crawler_parameters),
    }
    expected = [{'columns': ['row_id'], 'name': 'test1.jdbc_table'}]

    self.stubber.add_response('get_databases', {'DatabaseList': [{'Name': 'test1'}]}, {})
    self.stubber.add_response('get_tables', {'TableList': [jdbc_table]}, {'DatabaseName': 'test1'})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_view(self):
    """A table of type VIRTUAL_VIEW (Presto view) is listed with columns and metadata."""
    runner = Athena({"glue": True, "region": "mars-east-1"})

    # Stubbed catalog entry describing a Presto view with a single column.
    view_table = {
        "Name": "view",
        "StorageDescriptor": {
            "Columns": [{"Name": "sk", "Type": "int"}],
            "Location": "",
            "Compressed": False,
            "NumberOfBuckets": 0,
            "SerdeInfo": {},
            "SortColumns": [],
            "StoredAsSubDirectories": False,
        },
        "PartitionKeys": [],
        "ViewOriginalText": "/* Presto View: ... */",
        "ViewExpandedText": "/* Presto View */",
        "TableType": "VIRTUAL_VIEW",
        "Parameters": {"comment": "Presto View", "presto_view": "true"},
    }
    expected = [
        {
            "columns": ["sk"],
            "name": "test1.view",
            "metadata": [{"type": "int", "name": "sk"}],
        }
    ]

    self.stubber.add_response("get_databases", {"DatabaseList": [{"Name": "test1"}]}, {})
    self.stubber.add_response("get_tables", {"TableList": [view_table]}, {"DatabaseName": "test1"})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_partitioned_table(self):
    """
    Partitioned table as created by a GlueContext: the partition key is
    expected to be listed after the regular columns.
    """
    runner = Athena({'glue': True, 'region': 'mars-east-1'})

    storage_descriptor = {
        'Columns': [{'Name': 'sk', 'Type': 'int'}],
        'Location': 's3://bucket/prefix',
        'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
        'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
        'Compressed': False,
        'NumberOfBuckets': -1,
        'SerdeInfo': {
            'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
            'Parameters': {'serialization.format': '1'},
        },
        'BucketColumns': [],
        'SortColumns': [],
        'Parameters': {},
        'SkewedInfo': {
            'SkewedColumnNames': [],
            'SkewedColumnValues': [],
            'SkewedColumnValueLocationMaps': {},
        },
        'StoredAsSubDirectories': False,
    }
    partitioned_table = {
        'Name': 'partitioned_table',
        'StorageDescriptor': storage_descriptor,
        'PartitionKeys': [{'Name': 'category', 'Type': 'int'}],
        'TableType': 'EXTERNAL_TABLE',
        'Parameters': {'EXTERNAL': 'TRUE', 'transient_lastDdlTime': '1537505313'},
    }
    expected = [{'columns': ['sk', 'category'], 'name': 'test1.partitioned_table'}]

    self.stubber.add_response('get_databases', {'DatabaseList': [{'Name': 'test1'}]}, {})
    self.stubber.add_response(
        'get_tables',
        {'TableList': [partitioned_table]},
        {'DatabaseName': 'test1'},
    )

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def test_dodgy_table_does_not_break_schema_listing(self):
    """
    Not every Glue table contains a "PartitionKeys" entry (possibly an
    Athena Catalog to Glue catalog migration issue); the schema listing
    must still include such a table.
    """
    runner = Athena({'glue': True, 'region': 'mars-east-1'})

    serde_info = {
        'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
        'Parameters': {'field.delim': '|', 'skip.header.line.count': '1'},
    }
    # Note: no "PartitionKeys" key on this table entry.
    csv_table = {
        'Name': 'csv',
        'StorageDescriptor': {
            'Columns': [{'Name': 'region', 'Type': 'string'}],
            'Location': 's3://bucket/files/',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'Compressed': False,
            'NumberOfBuckets': 0,
            'SerdeInfo': serde_info,
            'SortColumns': [],
            'StoredAsSubDirectories': False,
        },
        'Parameters': {'classification': 'csv'},
    }
    expected = [{'columns': ['region'], 'name': 'test1.csv'}]

    self.stubber.add_response('get_databases', {'DatabaseList': [{'Name': 'test1'}]}, {})
    self.stubber.add_response('get_tables', {'TableList': [csv_table]}, {'DatabaseName': 'test1'})

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected
def setUp(self):
    """Build one entry per query runner: an instance plus its mock location string."""
    # (instance, mock_location) pairs, instantiated in this order.
    runner_specs = (
        (Presto({}), 'presto.Presto'),
        (Athena({}), 'athena.Athena'),
        (Mysql({'db': None}), 'mysql.Mysql'),
        (PostgreSQL({}), 'pg.PostgreSQL'),
        (Redshift({}), 'pg.Redshift'),
    )
    self.query_runners = [
        {'instance': instance, 'mock_location': location}
        for instance, location in runner_specs
    ]
def setUp(self):
    """Build one entry per query runner: an instance plus its mock location string."""
    # (instance, mock_location) pairs, instantiated in this order.
    runner_specs = (
        (Presto({}), "presto.Presto"),
        (Athena({}), "athena.Athena"),
        (Mysql({"db": None}), "mysql.Mysql"),
        (PostgreSQL({}), "pg.PostgreSQL"),
        (Redshift({}), "pg.Redshift"),
    )
    self.query_runners = [
        {"instance": instance, "mock_location": location}
        for instance, location in runner_specs
    ]
def test_partitioned_table(self):
    """
    Partitioned table as created by a GlueContext: the partition key is
    expected to be listed after the regular columns, in both the column
    names and the metadata.
    """
    runner = Athena({"glue": True, "region": "mars-east-1"})

    storage_descriptor = {
        "Columns": [{"Name": "sk", "Type": "int"}],
        "Location": "s3://bucket/prefix",
        "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
        "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
        "Compressed": False,
        "NumberOfBuckets": -1,
        "SerdeInfo": {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            "Parameters": {"serialization.format": "1"},
        },
        "BucketColumns": [],
        "SortColumns": [],
        "Parameters": {},
        "SkewedInfo": {
            "SkewedColumnNames": [],
            "SkewedColumnValues": [],
            "SkewedColumnValueLocationMaps": {},
        },
        "StoredAsSubDirectories": False,
    }
    partitioned_table = {
        "Name": "partitioned_table",
        "StorageDescriptor": storage_descriptor,
        "PartitionKeys": [{"Name": "category", "Type": "int"}],
        "TableType": "EXTERNAL_TABLE",
        "Parameters": {
            "EXTERNAL": "TRUE",
            "transient_lastDdlTime": "1537505313",
        },
    }
    expected = [
        {
            "columns": ["sk", "category"],
            "name": "test1.partitioned_table",
            "metadata": [
                {"type": "int", "name": "sk"},
                {"type": "int", "name": "category"},
            ],
        }
    ]

    self.stubber.add_response("get_databases", {"DatabaseList": [{"Name": "test1"}]}, {})
    self.stubber.add_response(
        "get_tables",
        {"TableList": [partitioned_table]},
        {"DatabaseName": "test1"},
    )

    with self.stubber:
        schema = runner.get_schema()
        assert schema == expected