Example 1
def vectorize(kernel_ast: ast.KernelFunction, instruction_set: str = 'best',
              assume_aligned: bool = False, nontemporal: Union[bool, Container[Union[str, Field]]] = False,
              assume_inner_stride_one: bool = False, assume_sufficient_line_padding: bool = True):
    """Explicit vectorization using SIMD vectorization via intrinsics.

    Args:
        kernel_ast: abstract syntax tree (KernelFunction node)
        instruction_set: one of the supported vector instruction sets, currently 'sse', 'avx', and 'avx512'
        assume_aligned: assume that the first inner cell of each line is aligned. If False, only unaligned
                        loads are used. If True, some of the loads are assumed to be from aligned memory
                        addresses. For example, if x is the fastest coordinate, the access to the center can
                        be fetched via an aligned-load instruction, while the west and east accesses may
                        require potentially slower unaligned-load instructions.
        nontemporal: a container of fields or field names for which nontemporal (streaming) stores are used.
                     If True, nontemporal access instructions are used for all fields.
        assume_inner_stride_one: kernels with a non-constant inner loop bound and strides cannot be
                                 vectorized, since the inner loop stride is a runtime variable and thus
                                 might not always be 1. If this parameter is set to True, the inner stride
                                 is assumed to always be one. This has to be ensured at runtime!
        assume_sufficient_line_padding: if True and assume_inner_stride_one is set, no tail loop is created;
                                        instead the loop is extended by at most (vector_width - 1) elements.
                                        This assumes that at the end of each line there is enough padding
                                        with dummy data; depending on the access pattern, additional padding
                                        may be required at the end of the array.
    """
    if instruction_set == 'best':
        supported_instruction_sets = get_supported_instruction_sets()
        instruction_set = supported_instruction_sets[-1] if supported_instruction_sets else 'avx'
    if instruction_set is None:
        return

    all_fields = kernel_ast.fields_accessed
    if nontemporal is None or nontemporal is False:
        nontemporal = {}
    elif nontemporal is True:
        nontemporal = all_fields

    if assume_inner_stride_one:
        replace_inner_stride_with_one(kernel_ast)

    field_float_dtypes = set(f.dtype for f in all_fields if f.dtype.is_float())
    if len(field_float_dtypes) != 1:
        raise NotImplementedError("Cannot vectorize kernels that contain accesses "
                                  "to differently typed floating point fields")
    float_size = field_float_dtypes.pop().numpy_dtype.itemsize
    assert float_size in (8, 4)
    default_float_type = 'double' if float_size == 8 else 'float'
    vector_is = get_vector_instruction_set(default_float_type, instruction_set=instruction_set)
    vector_width = vector_is['width']
    kernel_ast.instruction_set = vector_is

    strided = 'storeS' in vector_is and 'loadS' in vector_is
    keep_loop_stop = '{loop_stop}' in vector_is['storeA' if assume_aligned else 'storeU']
    vectorize_inner_loops_and_adapt_load_stores(kernel_ast, vector_width, assume_aligned, nontemporal,
                                                strided, keep_loop_stop, assume_sufficient_line_padding)
    insert_vector_casts(kernel_ast, default_float_type)
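
A minimal usage sketch for vectorize. The Jacobi-style update and field names are hypothetical, and it assumes pystencils' create_kernel returns the KernelFunction node that vectorize then mutates in place:

import pystencils as ps
from pystencils.cpu.vectorization import vectorize

src, dst = ps.fields("src, dst: double[2D]")  # hypothetical fields
update = ps.Assignment(dst.center(), 0.25 * (src[1, 0] + src[-1, 0] + src[0, 1] + src[0, -1]))
kernel_ast = ps.create_kernel(update)  # assumed to return a KernelFunction node
vectorize(kernel_ast, instruction_set='avx', assume_aligned=False)  # mutates the AST in place
kernel = kernel_ast.compile()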
Example 2
def default_create_kernel_parameters(generation_context, params):
    default_dtype = 'float64' if generation_context.double_accuracy else 'float32'

    if generation_context.optimize_for_localhost:
        supported_instruction_sets = get_supported_instruction_sets()
        if supported_instruction_sets:
            default_vec_is = supported_instruction_sets[-1]
        else:  # if the cpuinfo package is not installed
            default_vec_is = 'sse'
    else:
        default_vec_is = None

    params['target'] = params.get('target', 'cpu')
    params['data_type'] = params.get('data_type', default_dtype)
    params['cpu_openmp'] = params.get('cpu_openmp', generation_context.openmp)
    params['cpu_vectorize_info'] = params.get('cpu_vectorize_info', {})

    vec = params['cpu_vectorize_info']
    vec['instruction_set'] = vec.get('instruction_set', default_vec_is)
    vec['assume_inner_stride_one'] = True
    vec['assume_aligned'] = vec.get('assume_aligned', False)
    vec['nontemporal'] = vec.get('nontemporal', False)
    return params
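
A quick illustration of the resulting defaults, using types.SimpleNamespace as a stand-in for the real generation-context object (only the attributes read above are provided; the printed instruction set depends on the host CPU):

from types import SimpleNamespace

ctx = SimpleNamespace(double_accuracy=True, optimize_for_localhost=True, openmp=True)
params = default_create_kernel_parameters(ctx, {})
assert params['data_type'] == 'float64'
# params['cpu_vectorize_info']['instruction_set'] is e.g. 'avx' on a recent x86 CPU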
Example 3
def test_schaefer_turek():
    opt = {
        'vectorization': {
            'instruction_set': get_supported_instruction_sets()[-1],
            'assume_aligned': True
        },
        'openmp': 2
    }
    sc_2d_1 = schaefer_turek_2d(30,
                                max_lattice_velocity=0.08,
                                optimization=opt)
    sc_2d_1.run(30000)
    result = evaluate_static_quantities(sc_2d_1)
    assert 5.5 < result['c_D'] < 5.8
    assert 0.117 < result['DeltaP'] < 0.118
Example 4
def test_lbm_vectorization_short():
    print("Computing reference solutions")
    size1 = (64, 32)
    relaxation_rate = 1.8

    ldc1_ref = create_lid_driven_cavity(size1, relaxation_rate=relaxation_rate)
    ldc1_ref.run(10)

    lbm_config = LBMConfig(relaxation_rate=relaxation_rate)
    config = ps.CreateKernelConfig(
        cpu_vectorize_info={
            'instruction_set': get_supported_instruction_sets()[-1],
            'assume_aligned': True,
            'nontemporal': True,
            'assume_inner_stride_one': True,
            'assume_sufficient_line_padding': False,
        })
    ldc1 = create_lid_driven_cavity(size1,
                                    lbm_config=lbm_config,
                                    config=config,
                                    fixed_loop_sizes=False)
    ldc1.run(10)
Example 5
def test_hardware_query():
    instruction_sets = get_supported_instruction_sets()
    assert 'sse' in instruction_sets
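
Note that get_supported_instruction_sets() may return None when the CPU cannot be probed (e.g. the cpuinfo package is missing), so a guarded query, following the fallbacks used in Examples 1 and 2, could look like:

instruction_sets = get_supported_instruction_sets() or []
best_available = instruction_sets[-1] if instruction_sets else 'sse'  # conservative fallback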
Example 6
def aligned_empty(shape,
                  byte_alignment=True,
                  dtype=np.float64,
                  byte_offset=0,
                  order='C',
                  align_inner_coordinate=True):
    """
    Creates an aligned empty numpy array

    Args:
        shape: size of the array
        byte_alignment: alignment in bytes, such that for the array's start address a,
                        (a % byte_alignment) == 0 holds.
                        By default (byte_alignment=True), the maximum alignment required by the CPU is used
                        (or 512 bits if this cannot be detected). When 'cacheline' is specified, the size of
                        a cache line is used.
        dtype: numpy data type
        byte_offset: offset in bytes of the position that should be aligned, i.e.
                     (a + byte_offset) % byte_alignment == 0; typically used to align the first inner cell
                     instead of the ghost layer
        order: storage linearization order
        align_inner_coordinate: if True, the start of each innermost coordinate line is aligned as well
    """
    if byte_alignment is True or byte_alignment == 'cacheline':
        from pystencils.backends.simd_instruction_sets import (
            get_supported_instruction_sets, get_cacheline_size,
            get_vector_instruction_set)

        type_name = BasicType.numpy_name_to_c(np.dtype(dtype).name)
        instruction_sets = get_supported_instruction_sets()
        if instruction_sets is None:
            byte_alignment = 64
        elif byte_alignment == 'cacheline':
            cacheline_sizes = [
                get_cacheline_size(is_name) for is_name in instruction_sets
            ]
            if all([s is None for s in cacheline_sizes]):
                widths = [
                    get_vector_instruction_set(type_name, is_name)['width'] *
                    np.dtype(dtype).itemsize for is_name in instruction_sets
                    if type(
                        get_vector_instruction_set(type_name, is_name)
                        ['width']) is int
                ]
                byte_alignment = 64 if all([s is None
                                            for s in widths]) else max(widths)
            else:
                byte_alignment = max(
                    [s for s in cacheline_sizes if s is not None])
        elif not any([
                type(get_vector_instruction_set(type_name, is_name)['width'])
                is int for is_name in instruction_sets
        ]):
            byte_alignment = 64
        else:
            byte_alignment = max([
                get_vector_instruction_set(type_name, is_name)['width'] *
                np.dtype(dtype).itemsize for is_name in instruction_sets if
                type(get_vector_instruction_set(type_name, is_name)
                     ['width']) is int
            ])
    if (not align_inner_coordinate) or (not hasattr(shape, '__len__')):
        size = np.prod(shape)
        d = np.dtype(dtype)
        # 2 * byte_alignment instead of 1 * byte_alignment to have slack in the end such that
        # vectorized loops can access vector_width elements further and don't require a tail loop
        tmp = np.empty(size * d.itemsize + 2 * byte_alignment, dtype=np.uint8)
        address = tmp.__array_interface__['data'][0]
        offset = (byte_alignment -
                  (address + byte_offset) % byte_alignment) % byte_alignment
        return tmp[offset:offset + size * d.itemsize].view(dtype=d).reshape(
            shape, order=order)
    else:
        if order == 'C':
            dim0_size = shape[-1]
            dim0 = -1
            dim1_size = np.prod(shape[:-1])
        else:
            dim0_size = shape[0]
            dim0 = 0
            dim1_size = np.prod(shape[1:])
        d = np.dtype(dtype)

        assert byte_alignment >= d.itemsize and byte_alignment % d.itemsize == 0
        padding = (byte_alignment - (
            (dim0_size * d.itemsize) % byte_alignment)) % byte_alignment

        size = dim1_size * padding + np.prod(shape) * d.itemsize
        tmp = aligned_empty(size,
                            byte_alignment=byte_alignment,
                            dtype=np.uint8,
                            byte_offset=byte_offset)
        tmp = tmp.view(dtype=dtype)
        # padded shape in elements: the inner dimension is extended so each line is a
        # multiple of byte_alignment bytes long
        padded_shape = list(shape)
        padded_shape[dim0] = dim0_size + padding // d.itemsize
        tmp = tmp.reshape(padded_shape, order=order)
        if tmp.flags['C_CONTIGUOUS']:
            tmp = tmp[..., :shape[-1]]
        else:
            tmp = tmp[:shape[0], ...]

        return tmp
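
For example, requesting 32-byte alignment for a float64 array and verifying both the start address and the inner lines (the stride check relies on align_inner_coordinate=True, the default, which pads each line):

arr = aligned_empty((17, 19), byte_alignment=32)
address = arr.__array_interface__['data'][0]
assert address % 32 == 0         # start address is 32-byte aligned
assert arr.strides[0] % 32 == 0  # each inner line starts aligned as well
assert arr.shape == (17, 19)     # padding is hidden by the returned view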
Example 7
import numpy as np
import sympy as sp
import pytest

import pystencils as ps
from pystencils.astnodes import Block, Conditional
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets, get_vector_instruction_set
from pystencils.enums import Target
from pystencils.cpu.vectorization import vec_all, vec_any

supported_instruction_sets = get_supported_instruction_sets() or []


@pytest.mark.parametrize('instruction_set', supported_instruction_sets)
@pytest.mark.parametrize('dtype', ('float', 'double'))
def test_vec_any(instruction_set, dtype):
    if instruction_set in ['sve', 'rvv']:
        width = 4  # we don't know the actual value
    else:
        width = get_vector_instruction_set(dtype, instruction_set)['width']
    data_arr = np.zeros((4 * width, 4 * width),
                        dtype=np.float64 if dtype == 'double' else np.float32)

    data_arr[3:9, 1:3 * width - 1] = 1.0
    data = ps.fields(f"data: {dtype}[2D]", data=data_arr)

    c = [
        ps.Assignment(sp.Symbol("t1"), vec_any(data.center() > 0.0)),
        Conditional(vec_any(data.center() > 0.0),
                    Block([ps.Assignment(data.center(), 2.0)]))
    ]
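
A sketch of how such conditional assignments are typically compiled and run; the create_kernel call mirrors the cpu_vectorize_info style of Example 4 and is an assumption, while c, instruction_set, and data_arr come from the test above:

ast = ps.create_kernel(c, target=Target.CPU,
                       cpu_vectorize_info={'instruction_set': instruction_set})
kernel = ast.compile()
kernel(data=data_arr)  # vec_any lets whole vectors take the branch, so writes are vector-granular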
Example 8
import numpy as np

import pytest

import pystencils as ps
from pystencils.rng import PhiloxFourFloats, PhiloxTwoDoubles, AESNIFourFloats, AESNITwoDoubles, random_symbol
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from pystencils.cpu.cpujit import get_compiler_config
from pystencils.data_types import TypedSymbol
from pystencils.enums import Target

RNGs = {('philox', 'float'): PhiloxFourFloats, ('philox', 'double'): PhiloxTwoDoubles,
        ('aesni', 'float'): AESNIFourFloats, ('aesni', 'double'): AESNITwoDoubles}

instruction_sets = get_supported_instruction_sets()
if get_compiler_config()['os'] == 'windows':
    # skip instruction sets supported by the CPU but not by the compiler
    if 'avx' in instruction_sets and ('/arch:avx2' not in get_compiler_config()['flags'].lower()
                                      and '/arch:avx512' not in get_compiler_config()['flags'].lower()):
        instruction_sets.remove('avx')
    if 'avx512' in instruction_sets and '/arch:avx512' not in get_compiler_config()['flags'].lower():
        instruction_sets.remove('avx512')


@pytest.mark.parametrize('target,rng', ((Target.CPU, 'philox'), (Target.CPU, 'aesni'),
                                        (Target.GPU, 'philox')))
@pytest.mark.parametrize('precision', ('float', 'double'))
@pytest.mark.parametrize('dtype', ('float', 'double'))
def test_rng(target, rng, precision, dtype, t=124, offsets=(0, 0), keys=(0, 0), offset_values=None):
    if target == Target.GPU:
        pytest.importorskip('pycuda')
Example 9
import numpy as np
import pytest

import pystencils as ps
from pystencils.backends.simd_instruction_sets import get_supported_instruction_sets
from lbmpy.scenarios import create_lid_driven_cavity
from lbmpy.creationfunctions import LBMConfig, LBMOptimisation


@pytest.mark.skipif(not get_supported_instruction_sets(),
                    reason='cannot detect CPU instruction set')
def test_lbm_vectorization_short():
    print("Computing reference solutions")
    size1 = (64, 32)
    relaxation_rate = 1.8

    ldc1_ref = create_lid_driven_cavity(size1, relaxation_rate=relaxation_rate)
    ldc1_ref.run(10)

    lbm_config = LBMConfig(relaxation_rate=relaxation_rate)
    config = ps.CreateKernelConfig(
        cpu_vectorize_info={
            'instruction_set': get_supported_instruction_sets()[-1],
            'assume_aligned': True,
            'nontemporal': True,
            'assume_inner_stride_one': True,
            'assume_sufficient_line_padding': False,
        })
    ldc1 = create_lid_driven_cavity(size1,
                                    lbm_config=lbm_config,
                                    config=config,
                                    fixed_loop_sizes=False)
    ldc1.run(10)
Example 10
def long_run(steady=True, **kwargs):
    if steady:  # scenario 2D-1 in the paper
        sc = schaefer_turek_2d(60, max_lattice_velocity=0.05, **kwargs)
    else:  # Scenario 2D-2 (unsteady)
        sc = schaefer_turek_2d(40, u_max=1.5, max_lattice_velocity=0.01)

    for _ in range(100):
        sc.run(10000)
        res = evaluate_static_quantities(sc)
        print(res)
    import lbmpy.plot as plt
    plt.vector_field_magnitude(sc.velocity[:, :])
    plt.show()


@pytest.mark.skipif(not get_supported_instruction_sets(),
                    reason='cannot detect CPU instruction set')
def test_schaefer_turek():
    opt = {
        'vectorization': {
            'instruction_set': get_supported_instruction_sets()[-1],
            'assume_aligned': True
        },
        'openmp': 2
    }
    sc_2d_1 = schaefer_turek_2d(30,
                                max_lattice_velocity=0.08,
                                optimization=opt)
    sc_2d_1.run(30000)
    result = evaluate_static_quantities(sc_2d_1)
    assert 5.5 < result['c_D'] < 5.8
    assert 0.117 < result['DeltaP'] < 0.118