def create_test_database(self, unique_database):
  """Create the Parquet bloom-filter test table and load its data file.

  The table 'parquet_bloom_filter' is created in unique_database with a
  schema matching testdata/data/parquet-bloom-filtering.parquet, which is
  then copied into the table's location.
  """
  # Adjacent string literals are concatenated by the parser; the pieces
  # below spell out one CREATE TABLE statement (the {db}/{tbl} placeholders
  # are filled in by create_table_and_copy_files).
  ddl = ('create table {db}.{tbl} ( '
         ' int8_col TINYINT, '
         ' int16_col SMALLINT, '
         ' int32_col INT, '
         ' int64_col BIGINT, '
         ' float_col FLOAT, '
         ' double_col DOUBLE, '
         ' string_col STRING, '
         ' char_col VARCHAR(3) '
         ') '
         'stored as parquet ')
  create_table_and_copy_files(
      self.client, ddl, unique_database, 'parquet_bloom_filter',
      ['testdata/data/parquet-bloom-filtering.parquet'])
def test_deprecated_stats(self, vector, unique_database):
  """Test that reading parquet files with statistics with deprecated
  'min'/'max' fields works correctly. The statistics will be used for
  known-good types (boolean, integral, float) and will be ignored for all
  other types (string, decimal, timestamp)."""
  # CTAS (rather than "create table like") turns the partition columns of
  # functional.alltypessmall into ordinary table columns.
  ctas = ('create table {db}.{tbl} stored as parquet '
          'as select * from functional.alltypessmall '
          'limit 0')
  create_table_and_copy_files(self.client, ctas, unique_database,
                              'deprecated_stats',
                              ['testdata/data/deprecated_statistics.parquet'])
  # The test makes assumptions about the number of row groups that are
  # processed and skipped inside a fragment, so force a single fragment.
  exec_options = vector.get_value('exec_option')
  exec_options['num_nodes'] = 1
  self.run_test_case('QueryTest/parquet-deprecated-stats', vector,
                     unique_database)
def test_fileformat_support(self, vector, unique_database):
  """ Test that scanning and writing DATE is supported for text tables only."""
  # Databases and locations are spelled out explicitly in the SQL below, so
  # running for anything other than text fileformat on HDFS adds no coverage.
  if vector.get_value('table_format').file_format != 'text':
    pytest.skip()

  # Parquet-backed table with a single DATE column.
  # NOTE(review): this path has a leading slash while other callers pass a
  # relative 'testdata/...' path — presumably the copy helper accepts both.
  create_parquet = "CREATE TABLE {0}.{1} (date_col DATE) STORED AS PARQUET".format(
      unique_database, "parquet_date_tbl")
  create_table_and_copy_files(self.client, create_parquet, unique_database,
                              "parquet_date_tbl",
                              ["/testdata/data/date_tbl.parquet"])

  # Avro-backed table with a single DATE column.
  create_avro = "CREATE TABLE {0}.{1} (date_col DATE) STORED AS AVRO".format(
      unique_database, "avro_date_tbl")
  create_table_and_copy_files(self.client, create_avro, unique_database,
                              "avro_date_tbl",
                              ["/testdata/data/date_tbl.avro"])

  # Partitioned table whose two partitions point at the data directories of
  # the two tables created above.
  part_tbl = "date_tbl"
  self.client.execute(
      "CREATE TABLE {0}.{1} (date_col DATE) "
      "PARTITIONED BY (date_part DATE)".format(unique_database, part_tbl))
  self.client.execute(
      "ALTER TABLE {0}.{1} ADD "
      "PARTITION (date_part='1899-12-31') "
      "LOCATION '/test-warehouse/{0}.db/parquet_date_tbl' "
      "PARTITION (date_part='1999-12-31') "
      "LOCATION '/test-warehouse/{0}.db/avro_date_tbl' ".format(
          unique_database, part_tbl))
  # Tag one partition as Parquet and the other as Avro so a single scan
  # crosses fileformats.
  self.client.execute(
      "ALTER TABLE {0}.{1} PARTITION (date_part='1899-12-31') "
      "SET FILEFORMAT PARQUET".format(unique_database, part_tbl))
  self.client.execute(
      "ALTER TABLE {0}.{1} PARTITION (date_part='1999-12-31') "
      "SET FILEFORMAT AVRO".format(unique_database, part_tbl))

  # Test scanning/writing tables with different fileformats.
  self.run_test_case('QueryTest/date-fileformat-support', vector,
                     use_db=unique_database)
def _create_test_table_from_file(self, db_name, filename):
  """Create the 'parquet_bloom_filter' table in db_name and load filename
  into it, using the create statement stored on the instance.

  NOTE(review): relies on self.create_stmt being defined elsewhere on the
  class — confirm it is set before this helper is called.
  """
  create_table_and_copy_files(self.client, self.create_stmt, db_name,
                              'parquet_bloom_filter', [filename])