def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best', assume_aligned: bool = False,
              nontemporal: Union[bool, Container[Union[str, Field]]] = False,
              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
    """Explicit vectorization using SIMD vectorization via intrinsics.

    The kernel AST is modified in place; nothing is returned.

    Args:
        kernel_ast: abstract syntax tree (KernelFunction node)
        instruction_set: one of the supported vector instruction sets, currently ('sse', 'avx' and 'avx512').
                         With 'best' the last entry reported by ``get_supported_instruction_sets()`` is used,
                         falling back to 'avx' if nothing is reported.
                         With ``None`` the function returns immediately without touching the AST.
        assume_aligned: assume that the first inner cell of each line is aligned. If false, only unaligned-loads are
                        used. If true, some of the loads are assumed to be from aligned memory addresses.
                        For example if x is the fastest coordinate, the access to center can be fetched via an
                        aligned-load instruction, for the west or east accesses potentially slower unaligned-load
                        instructions have to be used.
        nontemporal: a container of fields or field names for which nontemporal (streaming) stores are used.
                     If true, nontemporal access instructions are used for all fields.
        assume_inner_stride_one: kernels with non-constant inner loop bound and strides can not be vectorized since
                                 the inner loop stride is a runtime variable and thus might not be always 1.
                                 If this parameter is set to true, the inner stride is assumed to be always one.
                                 This has to be ensured at runtime!
        assume_sufficient_line_padding: if True and assume_inner_stride_one, no tail loop is created but loop is
                                        extended by at most (vector_width-1) elements.
                                        Assumes that at the end of each line there is enough padding with dummy data;
                                        depending on the access pattern there might be additional padding
                                        required at the end of the array.

    Raises:
        NotImplementedError: if the accessed fields do not share exactly one floating point type.
    """
    if instruction_set == 'best':
        # Query the hardware only once; the result may be empty/None when detection is not possible.
        available_instruction_sets = get_supported_instruction_sets()
        instruction_set = available_instruction_sets[-1] if available_instruction_sets else 'avx'
    if instruction_set is None:
        return

    all_fields = kernel_ast.fields_accessed
    if nontemporal is None or nontemporal is False:
        nontemporal = {}
    elif nontemporal is True:
        nontemporal = all_fields

    if assume_inner_stride_one:
        replace_inner_stride_with_one(kernel_ast)

    field_float_dtypes = set(f.dtype for f in all_fields if f.dtype.is_float())
    if not field_float_dtypes:
        # Previously reported with the "differently typed" message below, which did not describe this case.
        raise NotImplementedError("Cannot vectorize kernels that do not access "
                                  "any floating point field")
    if len(field_float_dtypes) != 1:
        raise NotImplementedError("Cannot vectorize kernels that contain accesses "
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
    default_float_type = 'double' if float_size == 8 else 'float'
    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is

    # Strided access is only usable if the instruction set offers both strided stores and loads.
    strided = 'storeS' in vector_is and 'loadS' in vector_is
    # Some store instructions reference the loop stop in their template.
    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal,
                                                strided, keep_loop_stop, assume_sufficient_line_padding)
    insert_vector_casts(kernel_ast, default_float_type)
def default_create_kernel_parameters(generation_context, params):
    """Complete ``params`` with default kernel creation settings.

    Entries already present in ``params`` (and in its nested ``'cpu_vectorize_info'`` dict) are kept,
    except ``'assume_inner_stride_one'`` which is always set to True.

    Args:
        generation_context: object providing the attributes ``double_accuracy``, ``optimize_for_localhost``
                            and ``openmp``
        params: dictionary of kernel creation parameters; it is modified in place

    Returns:
        the same ``params`` dictionary that was passed in
    """
    default_dtype = "float64" if generation_context.double_accuracy else 'float32'

    default_vec_is = None
    if generation_context.optimize_for_localhost:
        # Query once and reuse the result; it is empty if the cpuinfo package is not installed.
        supported_instruction_sets = get_supported_instruction_sets()
        default_vec_is = supported_instruction_sets[-1] if supported_instruction_sets else 'sse'

    params.setdefault('target', 'cpu')
    params.setdefault('data_type', default_dtype)
    params.setdefault('cpu_openmp', generation_context.openmp)

    vec = params.setdefault('cpu_vectorize_info', {})
    vec.setdefault('instruction_set', default_vec_is)
    # Deliberately not a default: this overrides whatever the caller passed.
    vec['assume_inner_stride_one'] = True
    vec.setdefault('assume_aligned', False)
    vec.setdefault('nontemporal', False)
    return params
def test_schaefer_turek():
    """Run the Schaefer-Turek 2D scenario vectorized and check drag coefficient and pressure difference.

    The test is skipped when no CPU instruction set can be detected, since the
    vectorization settings need a concrete instruction set name.
    """
    import pytest

    instruction_sets = get_supported_instruction_sets()
    if not instruction_sets:
        # Indexing with [-1] below would fail on an empty or missing list.
        pytest.skip('cannot detect CPU instruction set')

    opt = {
        'vectorization': {
            'instruction_set': instruction_sets[-1],
            'assume_aligned': True
        },
        'openmp': 2
    }
    sc_2d_1 = schaefer_turek_2d(30, max_lattice_velocity=0.08, optimization=opt)
    sc_2d_1.run(30000)
    result = evaluate_static_quantities(sc_2d_1)
    assert 5.5 < result['c_D'] < 5.8
    assert 0.117 < result['DeltaP'] < 0.118
def test_lbm_vectorization_short():
    """Smoke test: run a lid driven cavity with and without explicit vectorization settings.

    The test is skipped when no CPU instruction set can be detected, since the
    vectorization settings need a concrete instruction set name.
    """
    import pytest

    instruction_sets = get_supported_instruction_sets()
    if not instruction_sets:
        # Indexing with [-1] below would fail on an empty or missing list.
        pytest.skip('cannot detect CPU instruction set')

    print("Computing reference solutions")
    size1 = (64, 32)
    relaxation_rate = 1.8

    ldc1_ref = create_lid_driven_cavity(size1, relaxation_rate=relaxation_rate)
    ldc1_ref.run(10)

    lbm_config = LBMConfig(relaxation_rate=relaxation_rate)
    config = ps.CreateKernelConfig(
        cpu_vectorize_info={
            'instruction_set': instruction_sets[-1],
            'assume_aligned': True,
            'nontemporal': True,
            'assume_inner_stride_one': True,
            'assume_sufficient_line_padding': False,
        })
    ldc1 = create_lid_driven_cavity(size1, lbm_config=lbm_config, config=config,
                                    fixed_loop_sizes=False)
    ldc1.run(10)
def test_hardware_query():
    """Check that the hardware query reports the 'sse' instruction set.

    The test is skipped when no instruction set can be detected, instead of
    failing on the membership check.
    """
    import pytest

    instruction_sets = get_supported_instruction_sets()
    if not instruction_sets:
        pytest.skip('cannot detect CPU instruction set')
    # NOTE(review): 'sse' presumably only exists on x86 CPUs - confirm whether this test
    # is expected to run on other architectures.
    assert 'sse' in instruction_sets
def aligned_empty(shape, byte_alignment=True, dtype=np.float64, byte_offset=0, order='C', align_inner_coordinate=True):
    """
    Creates an aligned empty numpy array

    Args:
        shape: size of the array
        byte_alignment: alignment in bytes, for the start address of the array holds (a % byte_alignment) == 0
                        By default, use the maximum required by the CPU (or 512 bits if this cannot be detected).
                        When 'cacheline' is specified, the size of a cache line is used.
        dtype: numpy data type
        byte_offset: offset in bytes for position that should be aligned i.e. (a+byte_offset) % byte_alignment == 0
                     typically used to align first inner cell instead of ghost layer
        order: storage linearization order; 'C' pads the last coordinate, any other value pads the first one
        align_inner_coordinate: if True, the start of the innermost coordinate lines are aligned as well

    Returns:
        uninitialized array of the requested shape and dtype, a view into a larger padded buffer
    """
    if byte_alignment is True or byte_alignment == 'cacheline':
        from pystencils.backends.simd_instruction_sets import (
            get_supported_instruction_sets, get_cacheline_size, get_vector_instruction_set)

        type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
        instruction_sets = get_supported_instruction_sets()
        if instruction_sets is None:
            byte_alignment = 64
        else:
            cacheline_sizes = []
            if byte_alignment == 'cacheline':
                cacheline_sizes = [get_cacheline_size(is_name) for is_name in instruction_sets]
                cacheline_sizes = [s for s in cacheline_sizes if s is not None]
            if cacheline_sizes:
                byte_alignment = max(cacheline_sizes)
            else:
                # Fall back to the widest vector register; widths that are not plain ints are ignored.
                item_size = np.dtype(dtype).itemsize
                widths = [get_vector_instruction_set(type_name, is_name)['width'] for is_name in instruction_sets]
                byte_widths = [w * item_size for w in widths if type(w) is int]
                byte_alignment = max(byte_widths) if byte_widths else 64

    d = np.dtype(dtype)
    if (not align_inner_coordinate) or (not hasattr(shape, '__len__')) or len(shape) == 0:
        # int(): np.prod of an empty shape is the float 1.0, which np.empty rejects as a size
        size = int(np.prod(shape))
        # 2 * byte_alignment instead of 1 * byte_alignment to have slack in the end such that
        # vectorized loops can access vector_width elements further and don't require a tail loop
        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
        address = tmp.__array_interface__['data'][0]
        offset = (byte_alignment - (address + byte_offset) % byte_alignment) % byte_alignment
        return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(shape, order=order)

    pad_last_axis = order == 'C'
    if pad_last_axis:
        dim0 = -1
        dim0_size = shape[-1]
        dim1_size = int(np.prod(shape[:-1]))
    else:
        dim0 = 0
        dim0_size = shape[0]
        dim1_size = int(np.prod(shape[1:]))

    assert byte_alignment >= d.itemsize and byte_alignment % d.itemsize == 0
    # bytes appended to every line so that the next line starts at an aligned address
    padding = (byte_alignment - ((dim0_size * d.itemsize) % byte_alignment)) % byte_alignment
    size = dim1_size * padding + int(np.prod(shape)) * d.itemsize

    tmp = aligned_empty(size, byte_alignment=byte_alignment, dtype=np.uint8, byte_offset=byte_offset)
    tmp = tmp.view(dtype=dtype)
    padded_shape = list(shape)
    padded_shape[dim0] = dim0_size + padding // d.itemsize
    tmp = tmp.reshape(padded_shape, order=order)

    # Cut the padding off the axis that was padded. The contiguity flags cannot be used to decide this:
    # e.g. a Fortran-ordered array of shape (n, 1) is flagged C-contiguous as well.
    if pad_last_axis:
        tmp = tmp[..., :shape[-1]]
    else:
        tmp = tmp[:shape[0], ...]
    return tmp
import numpy as np import sympy as sp import pytest import pystencils as ps from pystencils.astnodes import Block, Conditional from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set from pystencils.enums import Target from pystencils.cpu.vectorization import vec_all, vec_any supported_instruction_sets = get_supported_instruction_sets( ) if get_supported_instruction_sets() else [] @pytest.mark.parametrize('instruction_set', supported_instruction_sets) @pytest.mark.parametrize('dtype', ('float', 'double')) def test_vec_any(instruction_set, dtype): if instruction_set in ['sve', 'rvv']: width = 4 # we don't know the actual value else: width = get_vector_instruction_set(dtype, instruction_set)['width'] data_arr = np.zeros((4 * width, 4 * width), dtype=np.float64 if dtype == 'double' else np.float32) data_arr[3:9, 1:3 * width - 1] = 1.0 data = ps.fields(f"data: {dtype}[2D]", data=data_arr) c = [ ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)), Conditional(vec_any(data.center() > 0.0), Block([ps.Assignment(data.center(), 2.0)]))
import numpy as np import pytest import pystencils as ps from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles, random_symbol from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets from pystencils.cpu.cpujit import get_compiler_config from pystencils.data_types import TypedSymbol from pystencils.enums import Target RNGs = {('philox', 'float'): PhiloxFourFloats, ('philox', 'double'): PhiloxTwoDoubles, ('aesni', 'float'): AESNIFourFloats, ('aesni', 'double'): AESNITwoDoubles} instruction_sets = get_supported_instruction_sets() if get_compiler_config()['os'] == 'windows': # skip instruction sets supported by the CPU but not by the compiler if 'avx' in instruction_sets and ('/arch:avx2' not in get_compiler_config()['flags'].lower() and '/arch:avx512' not in get_compiler_config()['flags'].lower()): instruction_sets.remove('avx') if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower(): instruction_sets.remove('avx512') @pytest.mark.parametrize('target,rng', ( (Target.CPU, 'philox'), (Target.CPU, 'aesni'), (Target.GPU, 'philox'))) @pytest.mark.parametrize('precision', ('float', 'double')) @pytest.mark.parametrize('dtype', ('float', 'double')) def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None): if target == Target.GPU: pytest.importorskip('pycuda')
import numpy as np import pytest import pystencils as ps from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets from lbmpy.scenarios import create_lid_driven_cavity from lbmpy.creationfunctions import LBMConfig, LBMOptimisation @pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set') def test_lbm_vectorization_short(): print("Computing reference solutions") size1 = (64, 32) relaxation_rate = 1.8 ldc1_ref = create_lid_driven_cavity(size1, relaxation_rate=relaxation_rate) ldc1_ref.run(10) lbm_config = LBMConfig(relaxation_rate=relaxation_rate) config = ps.CreateKernelConfig( cpu_vectorize_info={ 'instruction_set': get_supported_instruction_sets()[-1], 'assume_aligned': True, 'nontemporal': True, 'assume_inner_stride_one': True, 'assume_sufficient_line_padding': False, }) ldc1 = create_lid_driven_cavity(size1, lbm_config=lbm_config, config=config,
def long_run(steady=True, **kwargs):
    """Run a Schaefer-Turek 2D scenario for 100 x 10000 steps, printing static quantities, then plot.

    Args:
        steady: if True, set up scenario 2D-1, otherwise the unsteady scenario 2D-2
        **kwargs: forwarded to ``schaefer_turek_2d`` for the steady scenario
    """
    if steady:  # scenario 2D-1 in the paper
        sc = schaefer_turek_2d(60, max_lattice_velocity=0.05, **kwargs)
    else:  # Scenario 2D-2 (unsteady)
        # NOTE(review): kwargs are not forwarded in this branch - confirm whether that is intended
        sc = schaefer_turek_2d(40, u_max=1.5, max_lattice_velocity=0.01)

    for i in range(100):
        sc.run(10000)
        res = evaluate_static_quantities(sc)
        print(res)
    # imported here so that plotting is only required when this function is actually called
    import lbmpy.plot as plt
    plt.vector_field_magnitude(sc.velocity[:, :])
    plt.show()


@pytest.mark.skipif(not get_supported_instruction_sets(), reason='cannot detect CPU instruction set')
def test_schaefer_turek():
    """Run the Schaefer-Turek 2D scenario with vectorization and check the drag coefficient."""
    opt = {
        'vectorization': {
            # last entry of the detected instruction sets; non-empty because of the skipif above
            'instruction_set': get_supported_instruction_sets()[-1],
            'assume_aligned': True
        },
        'openmp': 2
    }
    sc_2d_1 = schaefer_turek_2d(30, max_lattice_velocity=0.08, optimization=opt)
    sc_2d_1.run(30000)
    result = evaluate_static_quantities(sc_2d_1)
    assert 5.5 < result['c_D'] < 5.8