Exemplo n.º 1
0
    def test_view(self):
        """The schema listing includes a Glue view together with its single column."""
        query_runner = Athena({'glue': True, 'region': 'mars-east-1'})

        # Canned Glue payloads: one database containing one Presto view.
        databases_response = {'DatabaseList': [{'Name': 'test1'}]}
        view_storage = {
            'Columns': [{'Name': 'sk', 'Type': 'int'}],
            'Location': '',
            'Compressed': False,
            'NumberOfBuckets': 0,
            'SerdeInfo': {},
            'SortColumns': [],
            'StoredAsSubDirectories': False,
        }
        view_table = {
            'Name': 'view',
            'StorageDescriptor': view_storage,
            'PartitionKeys': [],
            'ViewOriginalText': '/* Presto View: ... */',
            'ViewExpandedText': '/* Presto View */',
            'TableType': 'VIRTUAL_VIEW',
            'Parameters': {'comment': 'Presto View', 'presto_view': 'true'},
        }
        tables_response = {'TableList': [view_table]}
        expected_schema = [{'columns': ['sk'], 'name': 'test1.view'}]

        # Responses are queued in the order the calls are expected.
        self.stubber.add_response('get_databases', databases_response, {})
        self.stubber.add_response('get_tables', tables_response, {'DatabaseName': 'test1'})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 2
0
    def test_no_storage_descriptor_table(self):
        """
        Not every Glue table carries a "StorageDescriptor" entry.

        Such a table is expected to be absent from the resulting schema.
        """
        query_runner = Athena({'glue': True, 'region': 'mars-east-1'})

        # The table deliberately has no "StorageDescriptor" key.
        bare_table = {
            'Name': 'no_storage_descriptor_table',
            'PartitionKeys': [],
            'TableType': 'EXTERNAL_TABLE',
            'Parameters': {'EXTERNAL': 'TRUE'},
        }
        databases_response = {'DatabaseList': [{'Name': 'test1'}]}
        tables_response = {'TableList': [bare_table]}

        self.stubber.add_response('get_databases', databases_response, {})
        self.stubber.add_response('get_tables', tables_response, {'DatabaseName': 'test1'})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == []
Exemplo n.º 3
0
    def test_external_table(self):
        """Schema listing for an unpartitioned table crawled through a JDBC connection."""
        query_runner = Athena({"glue": True, "region": "mars-east-1"})

        # The same crawler parameters appear on the storage descriptor and on
        # the table itself; each place receives its own copy.
        crawler_parameters = {
            "CrawlerSchemaDeserializerVersion": "1.0",
            "CrawlerSchemaSerializerVersion": "1.0",
            "UPDATED_BY_CRAWLER": "jdbc",
            "classification": "sqlserver",
            "compressionType": "none",
            "connectionName": "jdbctest",
            "typeOfData": "view",
        }
        storage_descriptor = {
            "Columns": [{"Name": "row_id", "Type": "int"}],
            "Location": "Database.Schema.Table",
            "Compressed": False,
            "NumberOfBuckets": -1,
            "SerdeInfo": {"Parameters": {}},
            "BucketColumns": [],
            "SortColumns": [],
            "Parameters": dict(crawler_parameters),
            "StoredAsSubDirectories": False,
        }
        jdbc_table = {
            "Name": "jdbc_table",
            "StorageDescriptor": storage_descriptor,
            "PartitionKeys": [],
            "TableType": "EXTERNAL_TABLE",
            "Parameters": dict(crawler_parameters),
        }
        databases_response = {"DatabaseList": [{"Name": "test1"}]}
        tables_response = {"TableList": [jdbc_table]}
        expected_schema = [{"columns": ["row_id"], "name": "test1.jdbc_table"}]

        self.stubber.add_response("get_databases", databases_response, {})
        self.stubber.add_response("get_tables", tables_response, {"DatabaseName": "test1"})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 4
0
    def test_dodgy_table_does_not_break_schema_listing(self):
        """
        Not every Glue table carries a "PartitionKeys" entry.

        This may be an Athena Catalog to Glue catalog migration issue. The
        table must still show up in the schema, with its column metadata.
        """
        query_runner = Athena({"glue": True, "region": "mars-east-1"})

        serde_info = {
            "SerializationLibrary": "org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe",
            "Parameters": {
                "field.delim": "|",
                "skip.header.line.count": "1",
            },
        }
        storage_descriptor = {
            "Columns": [{"Name": "region", "Type": "string"}],
            "Location": "s3://bucket/files/",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "Compressed": False,
            "NumberOfBuckets": 0,
            "SerdeInfo": serde_info,
            "SortColumns": [],
            "StoredAsSubDirectories": False,
        }
        # Note: no "PartitionKeys" key on this table.
        csv_table = {
            "Name": "csv",
            "StorageDescriptor": storage_descriptor,
            "Parameters": {"classification": "csv"},
        }
        databases_response = {"DatabaseList": [{"Name": "test1"}]}
        tables_response = {"TableList": [csv_table]}
        expected_schema = [
            {
                "columns": ["region"],
                "name": "test1.csv",
                "metadata": [{"type": "string", "name": "region"}],
            }
        ]

        self.stubber.add_response("get_databases", databases_response, {})
        self.stubber.add_response("get_tables", tables_response, {"DatabaseName": "test1"})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 5
0
    def test_external_table(self):
        """Schema listing for an unpartitioned table crawled through a JDBC connection."""
        query_runner = Athena({'glue': True, 'region': 'mars-east-1'})

        # Identical crawler parameters are attached to both the storage
        # descriptor and the table; each gets an independent copy.
        crawler_parameters = {
            'CrawlerSchemaDeserializerVersion': '1.0',
            'CrawlerSchemaSerializerVersion': '1.0',
            'UPDATED_BY_CRAWLER': 'jdbc',
            'classification': 'sqlserver',
            'compressionType': 'none',
            'connectionName': 'jdbctest',
            'typeOfData': 'view',
        }
        storage_descriptor = {
            'Columns': [{'Name': 'row_id', 'Type': 'int'}],
            'Location': 'Database.Schema.Table',
            'Compressed': False,
            'NumberOfBuckets': -1,
            'SerdeInfo': {'Parameters': {}},
            'BucketColumns': [],
            'SortColumns': [],
            'Parameters': dict(crawler_parameters),
            'StoredAsSubDirectories': False,
        }
        jdbc_table = {
            'Name': 'jdbc_table',
            'StorageDescriptor': storage_descriptor,
            'PartitionKeys': [],
            'TableType': 'EXTERNAL_TABLE',
            'Parameters': dict(crawler_parameters),
        }
        databases_response = {'DatabaseList': [{'Name': 'test1'}]}
        tables_response = {'TableList': [jdbc_table]}
        expected_schema = [{'columns': ['row_id'], 'name': 'test1.jdbc_table'}]

        self.stubber.add_response('get_databases', databases_response, {})
        self.stubber.add_response('get_tables', tables_response, {'DatabaseName': 'test1'})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 6
0
    def test_view(self):
        """The schema listing includes a Glue view, its column and the column metadata."""
        query_runner = Athena({"glue": True, "region": "mars-east-1"})

        # Canned Glue payloads: one database containing one Presto view.
        view_storage = {
            "Columns": [{"Name": "sk", "Type": "int"}],
            "Location": "",
            "Compressed": False,
            "NumberOfBuckets": 0,
            "SerdeInfo": {},
            "SortColumns": [],
            "StoredAsSubDirectories": False,
        }
        view_table = {
            "Name": "view",
            "StorageDescriptor": view_storage,
            "PartitionKeys": [],
            "ViewOriginalText": "/* Presto View: ... */",
            "ViewExpandedText": "/* Presto View */",
            "TableType": "VIRTUAL_VIEW",
            "Parameters": {"comment": "Presto View", "presto_view": "true"},
        }
        databases_response = {"DatabaseList": [{"Name": "test1"}]}
        tables_response = {"TableList": [view_table]}
        expected_schema = [
            {
                "columns": ["sk"],
                "name": "test1.view",
                "metadata": [{"type": "int", "name": "sk"}],
            }
        ]

        self.stubber.add_response("get_databases", databases_response, {})
        self.stubber.add_response("get_tables", tables_response, {"DatabaseName": "test1"})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 7
0
    def test_partitioned_table(self):
        """
        Partitioned table as created by a GlueContext.

        The partition key is expected to be listed after the regular columns.
        """
        query_runner = Athena({'glue': True, 'region': 'mars-east-1'})

        serde_info = {
            'SerializationLibrary': 'org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe',
            'Parameters': {'serialization.format': '1'},
        }
        skewed_info = {
            'SkewedColumnNames': [],
            'SkewedColumnValues': [],
            'SkewedColumnValueLocationMaps': {},
        }
        storage_descriptor = {
            'Columns': [{'Name': 'sk', 'Type': 'int'}],
            'Location': 's3://bucket/prefix',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'OutputFormat': 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat',
            'Compressed': False,
            'NumberOfBuckets': -1,
            'SerdeInfo': serde_info,
            'BucketColumns': [],
            'SortColumns': [],
            'Parameters': {},
            'SkewedInfo': skewed_info,
            'StoredAsSubDirectories': False,
        }
        partitioned_table = {
            'Name': 'partitioned_table',
            'StorageDescriptor': storage_descriptor,
            'PartitionKeys': [{'Name': 'category', 'Type': 'int'}],
            'TableType': 'EXTERNAL_TABLE',
            'Parameters': {'EXTERNAL': 'TRUE', 'transient_lastDdlTime': '1537505313'},
        }
        databases_response = {'DatabaseList': [{'Name': 'test1'}]}
        tables_response = {'TableList': [partitioned_table]}
        expected_schema = [{'columns': ['sk', 'category'], 'name': 'test1.partitioned_table'}]

        self.stubber.add_response('get_databases', databases_response, {})
        self.stubber.add_response('get_tables', tables_response, {'DatabaseName': 'test1'})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 8
0
    def test_dodgy_table_does_not_break_schema_listing(self):
        """
        Not every Glue table carries a "PartitionKeys" entry.

        This may be an Athena Catalog to Glue catalog migration issue. The
        table must still show up in the schema listing.
        """
        query_runner = Athena({'glue': True, 'region': 'mars-east-1'})

        serde_info = {
            'SerializationLibrary': 'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe',
            'Parameters': {'field.delim': '|', 'skip.header.line.count': '1'},
        }
        storage_descriptor = {
            'Columns': [{'Name': 'region', 'Type': 'string'}],
            'Location': 's3://bucket/files/',
            'InputFormat': 'org.apache.hadoop.mapred.TextInputFormat',
            'Compressed': False,
            'NumberOfBuckets': 0,
            'SerdeInfo': serde_info,
            'SortColumns': [],
            'StoredAsSubDirectories': False,
        }
        # Note: no "PartitionKeys" key on this table.
        csv_table = {
            'Name': 'csv',
            'StorageDescriptor': storage_descriptor,
            'Parameters': {'classification': 'csv'},
        }
        databases_response = {'DatabaseList': [{'Name': 'test1'}]}
        tables_response = {'TableList': [csv_table]}
        expected_schema = [{'columns': ['region'], 'name': 'test1.csv'}]

        self.stubber.add_response('get_databases', databases_response, {})
        self.stubber.add_response('get_tables', tables_response, {'DatabaseName': 'test1'})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema
Exemplo n.º 9
0
 def setUp(self):
     """Populate ``self.query_runners`` with one entry per query runner.

     Each entry is a dict holding the runner ``instance`` and the
     ``mock_location`` string that goes with it.
     """
     # (instance, mock_location) pairs, built in the same order as listed.
     runner_pairs = [
         (Presto({}), 'presto.Presto'),
         (Athena({}), 'athena.Athena'),
         (Mysql({'db': None}), 'mysql.Mysql'),
         (PostgreSQL({}), 'pg.PostgreSQL'),
         (Redshift({}), 'pg.Redshift'),
     ]
     self.query_runners = [
         {'instance': runner, 'mock_location': location}
         for runner, location in runner_pairs
     ]
Exemplo n.º 10
0
 def setUp(self):
     """Populate ``self.query_runners`` with one entry per query runner.

     Each entry is a dict holding the runner ``instance`` and the
     ``mock_location`` string that goes with it.
     """
     # (instance, mock_location) pairs, built in the same order as listed.
     runner_pairs = [
         (Presto({}), "presto.Presto"),
         (Athena({}), "athena.Athena"),
         (Mysql({"db": None}), "mysql.Mysql"),
         (PostgreSQL({}), "pg.PostgreSQL"),
         (Redshift({}), "pg.Redshift"),
     ]
     self.query_runners = [
         {"instance": runner, "mock_location": location}
         for runner, location in runner_pairs
     ]
Exemplo n.º 11
0
    def test_partitioned_table(self):
        """
        Partitioned table as created by a GlueContext.

        The partition key is expected to be listed after the regular columns,
        both in ``columns`` and in ``metadata``.
        """
        query_runner = Athena({"glue": True, "region": "mars-east-1"})

        serde_info = {
            "SerializationLibrary": "org.apache.hadoop.hive.ql.io.parquet.serde.ParquetHiveSerDe",
            "Parameters": {"serialization.format": "1"},
        }
        skewed_info = {
            "SkewedColumnNames": [],
            "SkewedColumnValues": [],
            "SkewedColumnValueLocationMaps": {},
        }
        storage_descriptor = {
            "Columns": [{"Name": "sk", "Type": "int"}],
            "Location": "s3://bucket/prefix",
            "InputFormat": "org.apache.hadoop.mapred.TextInputFormat",
            "OutputFormat": "org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat",
            "Compressed": False,
            "NumberOfBuckets": -1,
            "SerdeInfo": serde_info,
            "BucketColumns": [],
            "SortColumns": [],
            "Parameters": {},
            "SkewedInfo": skewed_info,
            "StoredAsSubDirectories": False,
        }
        partitioned_table = {
            "Name": "partitioned_table",
            "StorageDescriptor": storage_descriptor,
            "PartitionKeys": [{"Name": "category", "Type": "int"}],
            "TableType": "EXTERNAL_TABLE",
            "Parameters": {
                "EXTERNAL": "TRUE",
                "transient_lastDdlTime": "1537505313",
            },
        }
        databases_response = {"DatabaseList": [{"Name": "test1"}]}
        tables_response = {"TableList": [partitioned_table]}
        expected_schema = [
            {
                "columns": ["sk", "category"],
                "name": "test1.partitioned_table",
                "metadata": [
                    {"type": "int", "name": "sk"},
                    {"type": "int", "name": "category"},
                ],
            }
        ]

        self.stubber.add_response("get_databases", databases_response, {})
        self.stubber.add_response("get_tables", tables_response, {"DatabaseName": "test1"})

        with self.stubber:
            actual_schema = query_runner.get_schema()
            assert actual_schema == expected_schema