def create_exec_option_dimension_from_dict(exec_option_dimensions):
  """Builds a query exec option test dimension.

  Exhaustively goes through all combinations of the given query option values.
  For each combination create an exec option dictionary and add it as a value in
  the exec option test dimension. Each dictionary can then be passed via Beeswax
  to control Impala query execution behavior.

  TODO: In the future we could generate these values using pairwise to reduce
  total execution time.
  """
  # Walk the cross product of every specified option's value list, turning each
  # combination into an {option name: option value} dictionary.
  option_names = sorted(exec_option_dimensions)
  value_lists = (exec_option_dimensions[name] for name in option_names)
  exec_option_dicts = []
  for combination in product(*value_lists):
    exec_option_dicts.append(dict(zip(option_names, combination)))
  # Wrap the dictionaries up as a single test dimension.
  return ImpalaTestDimension('exec_option', *exec_option_dicts)
def add_test_dimensions(cls):
  super(TestInsertQueries, cls).add_test_dimensions()
  # Fix the exec_option vector to have a single value. This is needed should we decide
  # to run the insert tests in parallel (otherwise there will be two tests inserting
  # into the same table at the same time for the same file format).
  # TODO: When we do decide to run these tests in parallel we could create unique temp
  # tables for each test case to resolve the concurrency problems.
  if cls.exploration_strategy() == 'core':
    cls.ImpalaTestMatrix.add_dimension(
        create_exec_option_dimension(
            cluster_sizes=[0], disable_codegen_options=[False], batch_sizes=[0],
            sync_ddl=[0]))
    cls.ImpalaTestMatrix.add_dimension(
        create_uncompressed_text_dimension(cls.get_workload()))
  else:
    cls.ImpalaTestMatrix.add_dimension(
        create_exec_option_dimension(
            cluster_sizes=[0], disable_codegen_options=[False],
            batch_sizes=[0, 1, 16], sync_ddl=[0, 1]))
    cls.ImpalaTestMatrix.add_dimension(
        ImpalaTestDimension("compression_codec", *PARQUET_CODECS))

  # Insert is currently only supported for text and parquet
  # For parquet, we want to iterate through all the compression codecs
  # TODO: each column in parquet can have a different codec. We could
  # test all the codecs in one table/file with some additional flags.
  def parquet_or_uncompressed_text(v):
    file_format = v.get_value('table_format').file_format
    if file_format == 'parquet':
      return True
    return file_format == 'text' and v.get_value('compression_codec') == 'none'
  cls.ImpalaTestMatrix.add_constraint(parquet_or_uncompressed_text)

  def uncompressed_table_only(v):
    return v.get_value('table_format').compression_codec == 'none'
  cls.ImpalaTestMatrix.add_constraint(uncompressed_table_only)

  # Only test other batch sizes for uncompressed parquet to keep the execution time
  # within reasonable bounds.
  def default_batch_or_uncompressed_parquet(v):
    if v.get_value('exec_option')['batch_size'] == 0:
      return True
    return (v.get_value('table_format').file_format == 'parquet'
            and v.get_value('compression_codec') == 'none')
  cls.ImpalaTestMatrix.add_constraint(default_batch_or_uncompressed_parquet)
def create_table_info_dimension(cls, exploration_strategy):
  # If the user has specified a specific set of table formats to run against, then
  # use those. Otherwise, load from the workload test vectors.
  if pytest.config.option.table_formats:
    requested_formats = [
        TableFormatInfo.create_from_string(
            get_dataset_from_workload(cls.get_workload()), tf)
        for tf in pytest.config.option.table_formats.split(',')]
    tf_dimensions = ImpalaTestDimension('table_format', *requested_formats)
  else:
    tf_dimensions = load_table_info_dimension(cls.get_workload(), exploration_strategy)
  # If 'skip_hbase' is specified or the filesystem is isilon, s3 or local, we don't
  # need the hbase dimension.
  hbase_unsupported = (
      pytest.config.option.skip_hbase
      or TARGET_FILESYSTEM.lower() in ['s3', 'isilon', 'local', 'abfs', 'adls'])
  if hbase_unsupported:
    for dimension_value in tf_dimensions:
      if dimension_value.value.file_format == "hbase":
        # Drop the single hbase entry and stop scanning.
        tf_dimensions.remove(dimension_value)
        break
  return tf_dimensions
def load_table_info_dimension(workload_name, exploration_strategy, file_formats=None,
    compression_codecs=None):
  """Loads the test vector corresponding to the given workload and exploration
  strategy.

  Args:
    workload_name: name of the workload; selects the csv vector file located at
        WORKLOAD_DIR/<workload_name>/<workload_name>_<exploration_strategy>.csv.
    exploration_strategy: exploration strategy name forming part of the file name.
    file_formats: optional collection of file format names; when given, vector
        rows whose 'file_format' is not in it are skipped.
    compression_codecs: optional collection of codec names; when given, vector
        rows whose 'compression_codec' is not in it are skipped.

  Returns:
    An ImpalaTestDimension named 'table_format' with one TableFormatInfo value
    per surviving vector row.

  Raises:
    RuntimeError: if the vector file does not exist.
  """
  test_vector_file = os.path.join(
      WORKLOAD_DIR, workload_name,
      '%s_%s.csv' % (workload_name, exploration_strategy))
  if not os.path.isfile(test_vector_file):
    # BUGFIX: 'raise RuntimeError, msg' is Python 2-only syntax (a SyntaxError
    # under Python 3); the call form below is valid in both.
    raise RuntimeError('Vector file not found: ' + test_vector_file)
  vector_values = []
  # NOTE(review): 'rb' yields bytes under Python 3, which would break the str
  # comparisons below; kept as-is for Python 2 compatibility — confirm before
  # any Python 3 migration.
  with open(test_vector_file, 'rb') as vector_file:
    # Iterate the file object directly instead of materializing readlines().
    for line in vector_file:
      if line.strip().startswith('#'):
        continue
      # Each non-comment line is a comma-separated list of 'key: value' items;
      # parse it into a dict of TableFormatInfo constructor arguments.
      vals = dict((key.strip(), value.strip()) for key, value in
                  (item.split(':') for item in line.split(',')))
      # Skip Kudu if Kudu is not supported (IMPALA-4287).
      if os.environ['KUDU_IS_SUPPORTED'] != 'true' and vals['file_format'] == 'kudu':
        continue
      # If only loading specific file formats skip anything that doesn't match.
      if file_formats is not None and vals['file_format'] not in file_formats:
        continue
      if compression_codecs is not None and \
          vals['compression_codec'] not in compression_codecs:
        continue
      vector_values.append(TableFormatInfo(**vals))
  return ImpalaTestDimension('table_format', *vector_values)
def add_test_dimensions(cls):
  super(TestCancellation, cls).add_test_dimensions()
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('query', *QUERIES.keys()))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('query_type', *QUERY_TYPE))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('cancel_delay', *CANCEL_DELAY_IN_SECONDS))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('wait_action', *WAIT_ACTIONS))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('fail_rpc_action', *FAIL_RPC_ACTIONS))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('join_before_close', *JOIN_BEFORE_CLOSE))
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('buffer_pool_limit', 0))
  # CTAS is only tested against uncompressed text/parquet/kudu targets.
  cls.ImpalaTestMatrix.add_constraint(
      lambda v: v.get_value('query_type') != 'CTAS' or (
          v.get_value('table_format').file_format in ['text', 'parquet', 'kudu'] and
          v.get_value('table_format').compression_codec == 'none'))
  cls.ImpalaTestMatrix.add_constraint(
      lambda v: v.get_value('exec_option')['batch_size'] == 0)
  # Ignore 'compute stats' queries for the CTAS query type.
  cls.ImpalaTestMatrix.add_constraint(
      lambda v: not (v.get_value('query_type') == 'CTAS' and
                     v.get_value('query').startswith('compute stats')))
  # Ignore CTAS on Kudu if there is no PRIMARY KEY specified.
  cls.ImpalaTestMatrix.add_constraint(lambda v: not (
      v.get_value('query_type') == 'CTAS' and
      v.get_value('table_format').file_format == 'kudu' and
      QUERIES[v.get_value('query')] is None))
  # tpch tables are not generated for hbase as the data loading takes a very long time.
  # TODO: Add cancellation tests for hbase.
  cls.ImpalaTestMatrix.add_constraint(
      lambda v: v.get_value('table_format').file_format != 'hbase')
  if cls.exploration_strategy() != 'core':
    # BUGFIX: without the 'global' declaration this assignment only bound an
    # unused function-local variable, so the module-level value was never
    # actually raised to 3 for non-core exploration strategies.
    global NUM_CANCELATION_ITERATIONS
    NUM_CANCELATION_ITERATIONS = 3
def add_test_dimensions(cls):
  super(TestScanRangeLengths, cls).add_test_dimensions()
  # Run every test once per configured maximum scan range length.
  scan_range_dim = ImpalaTestDimension(
      'max_scan_range_length', *MAX_SCAN_RANGE_LENGTHS)
  cls.ImpalaTestMatrix.add_dimension(scan_range_dim)
def add_test_dimensions(cls):
  super(TestWideTable, cls).add_test_dimensions()
  num_cols_dim = ImpalaTestDimension("num_cols", *cls.NUM_COLS)
  cls.ImpalaTestMatrix.add_dimension(num_cols_dim)
  # To cut down on test execution time, only run in exhaustive.
  if cls.exploration_strategy() != 'exhaustive':
    # An always-false constraint prunes every vector, skipping the whole suite.
    cls.ImpalaTestMatrix.add_constraint(lambda v: False)
def create_beeswax_dimension():
  """Returns a client 'protocol' test dimension restricted to beeswax."""
  protocols = ('beeswax',)
  return ImpalaTestDimension('protocol', *protocols)
def create_avro_snappy_dimension(workload):
  """Returns a 'table_format' dimension pinned to snappy block-compressed avro
  for the given workload's dataset."""
  table_format = TableFormatInfo.create_from_string(
      get_dataset_from_workload(workload), 'avro/snap/block')
  return ImpalaTestDimension('table_format', table_format)
def add_test_dimensions(cls):
  # Each dimension value pairs a table format name with the file extension
  # used for its data files.
  format_extension_pairs = [('parquet', '.parq'), ('textfile', '.txt')]
  cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension(
      'table_format_and_file_extension', *format_extension_pairs))
def create_client_protocol_no_strict_dimension():
  """Returns a 'strict_hs2_protocol' dimension with strict mode disabled."""
  strict_mode = False
  return ImpalaTestDimension('strict_hs2_protocol', strict_mode)
def add_test_dimensions(cls):
  super(TestMtDop, cls).add_test_dimensions()
  # Run every test once per mt_dop value under test.
  mt_dop_dim = ImpalaTestDimension('mt_dop', *MT_DOP_VALUES)
  cls.ImpalaTestMatrix.add_dimension(mt_dop_dim)
def create_parquet_dimension(workload):
  """Returns a 'table_format' dimension pinned to uncompressed parquet for the
  given workload's dataset."""
  dataset = get_dataset_from_workload(workload)
  parquet_format = TableFormatInfo.create_from_string(dataset, 'parquet/none')
  return ImpalaTestDimension('table_format', parquet_format)
def add_test_dimensions(cls):
  super(TestMtDopAdmissionSlots, cls).add_test_dimensions()
  cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('mt_dop', 4))

  # Restrict the matrix to parquet tables only.
  def parquet_only(v):
    return v.get_value('table_format').file_format == 'parquet'
  cls.ImpalaTestMatrix.add_constraint(parquet_only)
def add_test_dimensions(cls):
  super(TestNestedTypes, cls).add_test_dimensions()

  # Only parquet and orc table formats participate in these tests.
  def nested_capable_format(v):
    return v.get_value('table_format').file_format in ['parquet', 'orc']
  cls.ImpalaTestMatrix.add_constraint(nested_capable_format)
  cls.ImpalaTestMatrix.add_dimension(ImpalaTestDimension('mt_dop', 0, 2))
def add_test_dimensions(cls):
  super(TestMaxNestingDepth, cls).add_test_dimensions()

  # Only parquet and orc table formats participate in these tests.
  def nested_capable_format(v):
    return v.get_value('table_format').file_format in ['parquet', 'orc']
  cls.ImpalaTestMatrix.add_constraint(nested_capable_format)
  cls.ImpalaTestMatrix.add_dimension(
      ImpalaTestDimension('orc_schema_resolution', 0, 1))
  cls.ImpalaTestMatrix.add_constraint(orc_schema_resolution_constraint)
def create_beeswax_hs2_hs2http_dimension():
  """Returns a 'protocol' dimension covering all three client protocols."""
  protocols = ('beeswax', 'hs2', 'hs2-http')
  return ImpalaTestDimension('protocol', *protocols)
def add_test_dimensions(cls):
  super(TestHdfsFileMods, cls).add_test_dimensions()
  # Run every test once per file modification type.
  modification_dim = ImpalaTestDimension('modification_type', *MODIFICATION_TYPES)
  cls.ImpalaTestMatrix.add_dimension(modification_dim)
  cls.ImpalaTestMatrix.add_constraint(cls.file_format_constraint)
def create_uncompressed_text_dimension(workload):
  """Returns a 'table_format' dimension pinned to uncompressed text for the
  given workload's dataset."""
  dataset = get_dataset_from_workload(workload)
  text_format = TableFormatInfo.create_from_string(dataset, 'text/none')
  return ImpalaTestDimension('table_format', text_format)
def add_test_dimensions(cls):
  super(TestHashJoinTimer, cls).add_test_dimensions()
  # NOTE(review): dimension name 'test cases' (with a space) is kept exactly
  # as the existing tests reference it.
  test_case_dim = ImpalaTestDimension('test cases', *cls.TEST_CASES)
  cls.ImpalaTestMatrix.add_dimension(test_case_dim)

  def vector_is_valid(v):
    return cls.__is_valid_test_vector(v)
  cls.ImpalaTestMatrix.add_constraint(vector_is_valid)
def add_test_dimensions(cls):
  super(TestRowsAvailability, cls).add_test_dimensions()
  query_dim = ImpalaTestDimension('query', *cls.QUERIES)
  cls.ImpalaTestMatrix.add_dimension(query_dim)

  def vector_is_valid(v):
    return cls.__is_valid_test_vector(v)
  cls.ImpalaTestMatrix.add_constraint(vector_is_valid)
def add_test_dimensions(cls):
  super(TestParquetArrayEncodings, cls).add_test_dimensions()
  resolution_dim = ImpalaTestDimension(
      'parquet_array_resolution',
      *TestParquetArrayEncodings.ARRAY_RESOLUTION_POLICIES)
  cls.ImpalaTestMatrix.add_dimension(resolution_dim)

  # Array encodings are parquet-specific, so drop every other table format.
  def parquet_only(v):
    return v.get_value('table_format').file_format == 'parquet'
  cls.ImpalaTestMatrix.add_constraint(parquet_only)
def add_test_dimensions(cls):
  super(TestMtDopParquet, cls).add_test_dimensions()
  mt_dop_dim = ImpalaTestDimension('mt_dop', *MT_DOP_VALUES)
  cls.ImpalaTestMatrix.add_dimension(mt_dop_dim)

  # Restrict the matrix to parquet tables only.
  def parquet_only(v):
    return v.get_value('table_format').file_format == 'parquet'
  cls.ImpalaTestMatrix.add_constraint(parquet_only)
def create_orc_dimension(workload):
  """Returns a 'table_format' dimension pinned to orc with default compression
  for the given workload's dataset."""
  dataset = get_dataset_from_workload(workload)
  orc_format = TableFormatInfo.create_from_string(dataset, 'orc/def')
  return ImpalaTestDimension('table_format', orc_format)