예제 #1
0
def test_long_conflicting_names(context, q):
    cu_source = """
__device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(float *d) {
    d[1] = 1.0f;
}

__device__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(float *d) {
    d[2] = 3.0f;
}

__global__ void mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec(float *data) {
    data[0] = 123.0f;
    mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionname(data);
    mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnameb(data);
}
"""
    mangled_name = test_common.mangle('mysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamemysuperlongfunctionnamec', ['float *'])
    cl_source = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', cl_source)
    for line in cl_source.split("\n"):
        if line.strip().startswith('/*'):
            continue
        if not line.strip().replace('kernel ', '').strip().startswith('void'):
            continue
        name = line.replace('kernel ', '').replace('void ', '').split('(')[0]
        if name != '':
            print('name', name)
            assert len(name) <= 32
    test_common.build_kernel(context, cl_source, mangled_name[:31])
예제 #2
0
def test_short_names(context):
    cu_source = """
__device__ void funca(float *d);

__device__ void funca(float *d) {
    d[1] = 1.0f;
}

__device__ void funcb(float *d, int c) {
    d[2] = 3.0f + 5 - d[c];
}

__global__ void funck(float *data) {
    data[0] = 123.0f;
    funca(data);
    funcb(data, (int)data[6]);
    for(int i = 0; i < 1000; i++) {
        funcb(data + i, (int)data[i + 100]);
    }
}
"""
    mangled_name = test_common.mangle('funck', ['float *'])
    cl_source = test_common.cu_to_cl(cu_source, mangled_name)
    print('cl_source', cl_source)

    test_common.build_kernel(context, cl_source, mangled_name[:31])
예제 #3
0
def test_atomic_inc_int(context, q, int_data, int_data_gpu):
    # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
    cu_code = """
__global__ void mykernel(int *data, int limit) {
    atomicInc((unsigned int *)data, limit);
}
"""
    cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPii', 1)
    print('cl_code', cl_code)

    int_data[0] = 0
    int_data[1] = 0
    kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPii')
    cl.enqueue_copy(q, int_data_gpu, int_data)
    num_blocks = 4
    threads_per_block = 4
    modulus = 11
    kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), np.int32(256), cl.LocalMemory(32))
    kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(4), np.int32(modulus - 1), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    print('from_gpu', from_gpu[:2])

    assert from_gpu[0] == num_blocks * threads_per_block
    assert from_gpu[1] == num_blocks * threads_per_block % modulus
예제 #4
0
def test_umulhi(context, q, int_data, int_data_gpu):
    ll_code = """
declare i32 @_Z8__umulhiii(i32, i32)

define void @test_umulhi(i32* %data) {
  %1 = load i32, i32* %data

  %2 = getelementptr i32, i32* %data, i32 1
  %3 = load i32, i32* %2

  %4 = getelementptr i32, i32* %data, i32 2
  %5 = load i32, i32* %4

  %6 = call i32 @_Z8__umulhiii(i32 %3, i32 %5)
  store i32 %6, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'test_umulhi', 1)
    print('cl_code', cl_code)
    int_data[0] = 0
    int_data[1] = -50
    int_data[2] = 2523123
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'test_umulhi')
    kernel(q, (32,), (32,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    expected = (np.uint64(np.uint32(2523123)) * np.uint64(np.uint32(-50))) // 2**32
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert expected == from_gpu[0].item()
예제 #5
0
def test_insertvalue(context, q, float_data, float_data_gpu):
    sourcecode = """
struct mystruct {
    int f0;
    float f1;
};

__device__ struct mystruct doSomething(struct mystruct foo, int somevalue);

__device__ struct mystruct doSomething(struct mystruct foo, int somevalue) {
    foo.f0 = somevalue;
    foo.f1 = 4.5f;
    return foo;
}

__global__ void somekernel(float *data) {
    struct mystruct foo;
    foo.f0 = 3;
    foo.f1 = 4.5;
    foo = doSomething(foo, data[2]);
    data[0] = (int)foo.f0;
    data[1] = foo.f1;
}
"""
    mangledname = test_common.mangle('somekernel', ['float *'])
    cl_code = test_common.cu_to_cl(sourcecode, mangledname)
    kernel = test_common.build_kernel(context, cl_code, mangledname)
예제 #6
0
def test_memcpy(context, q, int_data, int_data_gpu):
    ll_code = """
declare void @_Z6memcpyPvPKvm(i8*, i8*, i64)

define void @mykernel(i32* %data) {
  %1 = bitcast i32* %data to i8*

  %2 = getelementptr i32, i32* %data, i32 8
  %3 = bitcast i32* %2 to i8*

  call void @_Z6memcpyPvPKvm(i8 *%3, i8 *%1, i64 32)
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', num_clmems=1)
    print('cl_code', cl_code)
    for i in range(8):
        int_data[i] = 3 + i
    cl.enqueue_copy(q, int_data_gpu, int_data)
    kernel = test_common.build_kernel(context, cl_code, 'mykernel')
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    for i in range(8):
        print(i, from_gpu[8 + i])
        assert from_gpu[8 + i] == 3 + i
예제 #7
0
def test_inlining(context, q, float_data, float_data_gpu):

    cu_source = """
__global__ void myKernel(float *data) {
    data[0] = (data[3] * (data[1] + data[2])) / data[4];
    data[7] = (data[3] / (data[1] - data[2])) * data[4];
}
"""
    kernelName = test_common.mangle('myKernel', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)

    for i in range(10):
        float_data[i] = i + 3
    cl.enqueue_copy(q, float_data_gpu, float_data)
    q.finish()
    # prog = cl.Program(context, sourcecode).build()
    # prog.__getattr__(kernelName)(
    kernel(q, (32, ), (32, ), float_data_gpu, offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    float_data2 = np.zeros((1024, ), dtype=np.float32)
    cl.enqueue_copy(q, float_data2, float_data_gpu)
    q.finish()
    print('float_data2[0]', float_data2[0])
    d = float_data
    d2 = float_data2
    expect = (d[3] * (d[1] + d[2])) / d[4]
    assert abs(d2[0] - expect) < 1e-5
def test_int_expressions(context, q, int_data, int_data_gpu):

    cu_source = """
__global__ void myKernel(int *data) {
    data[0] = (data[10] | data[11]) == data[12];
}
"""
    kernelName = test_common.mangle('myKernel', ['int *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)

    for i in range(50):
        int_data[i] = 0
    int_data[10] = 2
    int_data[11] = 1
    int_data[12] = 1
    cl.enqueue_copy(q, int_data_gpu, int_data)
    q.finish()
    # prog = cl.Program(context, sourcecode).build()
    # prog.__getattr__(kernelName)(
    kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
           cl.LocalMemory(4))
    q.finish()
    gpu_data = np.zeros((1024, ), dtype=np.int32)
    cl.enqueue_copy(q, gpu_data, int_data_gpu)
    q.finish()
    data = int_data
    actual = gpu_data[0]
    expected = int((data[10] | data[11]) == data[12])
    print('actual', actual, 'expected', expected)
    assert actual == expected
예제 #9
0
def test_sincos(context, q, float_data, float_data_gpu):

    cu_code = """
__global__ void mykernel(float *data) {
    sincosf(0.1, &data[0], &data[1]);
    sincosf(data[2], &data[3], &data[4]);
}
"""
    kernel_name = test_common.mangle('mykernel', ['float*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    print('cl_code', cl_code)

    float_data[2] = -0.3
    float_data_orig = np.copy(float_data)
    cl.enqueue_copy(q, float_data_gpu, float_data)

    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:5])
    assert abs(float_data[0] - math.sin(0.1)) < 1e-4
    assert abs(float_data[1] - math.cos(0.1)) < 1e-4
    assert abs(float_data[3] - math.sin(float_data_orig[2])) < 1e-4
    assert abs(float_data[4] - math.cos(float_data_orig[2])) < 1e-4
예제 #10
0
def test_ieeefloats(context, q, float_data, float_data_gpu):
    cu_code = """
__global__ void mykernel(double *data) {
    double d_neginfinity = -INFINITY;
    double d_posinfinity = INFINITY;
    float f_neginfinity = -INFINITY;
    float f_posinfinity = INFINITY;
    data[0] = INFINITY;
    data[1] = -INFINITY;
    data[2] = f_neginfinity;
    data[3] = f_posinfinity;
}
"""
    kernel_name = test_common.mangle('mykernel', ['double*'])
    cl_code = test_common.cu_to_cl(cu_code, kernel_name, num_clmems=1)
    kernel = test_common.build_kernel(context, cl_code, kernel_name)
    kernel(
        q, (32,), (32,),
        float_data_gpu, offset_type(0), cl.LocalMemory(4))
    q.finish()
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print(float_data[:4])
    assert float_data[0] == np.inf
    assert float_data[1] == - np.inf
    assert float_data[2] == - np.inf
    assert float_data[3] == np.inf
예제 #11
0
def test_sext(context, q, int_data, int_data_gpu):
    ll_code = """
define void @mykernel(i32* %data) {
  %1 = load i32, i32* %data
  %2 = sext i32 %1 to i64
  %3 = lshr i64 %2, 32
  %4 = trunc i64 %3 to i32
  store i32 %4, i32* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    for experiment in [{'in': 23, 'out': 0}, {'in': -1, 'out': -1}]:
        int_data[0] = experiment['in']
        cl.enqueue_copy(q, int_data_gpu, int_data)
        kernel = test_common.build_kernel(context, cl_code, 'mykernel')
        kernel(q, (32, ), (32, ), int_data_gpu, offset_type(0), offset_type(0),
               cl.LocalMemory(32))
        from_gpu = np.copy(int_data)
        cl.enqueue_copy(q, from_gpu, int_data_gpu)
        q.finish()
        # expected = (np.uint32(int_data[1]) * np.uint32(int_data[2])) >> 32
        expected = experiment['out']
        print('expected', expected)
        print('from_gpu[0]', from_gpu[0])
        assert expected == from_gpu[0].item()
        split_cl = cl_code.split('\n')
        found_long_cast = False
        for line in split_cl:
            if ' >> 32' in line and '(long)' in line:
                found_long_cast = True
        assert found_long_cast
예제 #12
0
def test_compile(context, cu_filepath, kernelname, num_clmems):
    with open(cu_filepath, 'r') as f:
        cu_code = f.read()

    ll_sourcecode = test_common.cu_to_ll(cu_code)

    for line in ll_sourcecode.split('\n'):
        if line.startswith('define') and kernelname in line:
            mangledname = line.split('@')[1].split('(')[0]
            break

    print('mangledname', mangledname)

    cl_code = test_common.cu_to_cl(cu_code, mangledname, num_clmems=num_clmems)
    print('got cl_code')

    test_common.build_kernel(context, cl_code, mangledname)
    print('after build kernel')
예제 #13
0
def test_compile(context, cu_filepath, kernelname):
    with open(cu_filepath, 'r') as f:
        cu_code = f.read()

    try:
        cl_code = test_common.cu_to_cl(cu_code, '')
    except:
        pass

    with open('/tmp/testprog-device.ll') as f:
        ll_code = f.read()

    for line in ll_code.split('\n'):
        if line.startswith('define') and kernelname in line:
            mangledname = line.split('@')[1].split('(')[0]
            break

    print('mangledname', mangledname)

    cl_code = test_common.cu_to_cl(cu_code, mangledname)

    test_common.build_kernel(context, cl_code, mangledname[:31])
예제 #14
0
def test_getelementptr_struct_local(context, q, float_data, float_data_gpu):
    cu_source = """
struct MyStruct {
    float* f0;
    float* f1;
};

    __global__ void foo(float *data) {
        struct MyStruct astruct;
        float *floats = astruct.f0;
    }
"""
    kernelName = test_common.mangle('foo', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=1)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
예제 #15
0
def test_getelementptr_struct_global(context, q, float_data, float_data_gpu):
    cu_source = """
struct MyStruct {
    float* f0;
    float* f1;
};

    __global__ void foo(struct MyStruct mystruct) {
        float *floats = mystruct.f0;
    }
"""
    # kernelName = test_common.mangle('foo', ['float *'])
    kernelName = '_Z3foo8MyStruct'
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName, num_clmems=3)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
예제 #16
0
def test_atomic_cas_unsignedint(context, q, int_data, int_data_gpu):
    # atomicCAS api based on usage in Eigen unsupported/Eigen/CXX11/src/Tensor/TensorReductionCuda.h
    cu_code = """
__global__ void mykernel(unsigned int *data) {
    int tid = threadIdx.x;
    int gid = blockIdx.x * blockDim.x + threadIdx.x;
    int numAttempts = 0;
    while(true) {
        numAttempts += 1;
        unsigned int oldValue = data[0];
        unsigned int newValue = oldValue + 1;
        unsigned int returnedOld = atomicCAS(data, oldValue, newValue);
        if(returnedOld == oldValue) {
            break;
        }
    }
    data[1 + gid] = numAttempts;
}
"""
    cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPj', 1)
    print('cl_code', cl_code)

    int_data[0] = 0
    kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPj')
    cl.enqueue_copy(q, int_data_gpu, int_data)
    num_blocks = 4
    threads_per_block = 4
    kernel(q, (num_blocks * threads_per_block,), (threads_per_block,), int_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(int_data)
    cl.enqueue_copy(q, from_gpu, int_data_gpu)
    q.finish()
    print('from_gpu', from_gpu[:17])

    # check two things:
    # - final value of data[0] should equal num_blocks * threads_per_block
    # - the number of attempts should be unique, for each thread

    assert from_gpu[0] == num_blocks * threads_per_block
    seen_num_attempts = set()
    for i in range(num_blocks * threads_per_block):
        num_attempts = from_gpu[i + 1]
        if num_attempts in seen_num_attempts:
            raise Exception('already saw num_attempts %s' % num_attempts)
        seen_num_attempts.add(num_attempts)
예제 #17
0
def test_fabs_double(context, q, float_data, float_data_gpu):
    cu_code = """
__global__ void mykernel(float *data) {
    data[0] = fabs(data[0]);
}
"""
    cl_code = test_common.cu_to_cl(cu_code, '_Z8mykernelPf', 1)
    print('cl_code', cl_code)
    float_data[0] = -0.123
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel = test_common.build_kernel(context, cl_code, '_Z8mykernelPf')
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    expected = 0.123
    print('expected', expected)
    print('from_gpu[0]', from_gpu[0])
    assert abs(expected - from_gpu[0].item()) < 1e-4
예제 #18
0
def test_float_constants_from_ll(context, q, float_data, float_data_gpu):
    ll_code = """
define void @kernel_float_constants(float* nocapture %data) #1 {
  store float 0x3E7AD7F2A0000000, float* %data
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'kernel_float_constants', 1)
    print('cl_code', cl_code)
    # try compiling it, just to be sure...
    kernel = test_common.build_kernel(context, cl_code, 'kernel_float_constants')
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    print('from_gpu[0]', from_gpu[0])
    print(type(from_gpu[0]), type(1e-7))
    assert abs(float(from_gpu[0]) - 1e-7) <= 1e-10
    assert 'data[0] = 1e-07f' in cl_code
예제 #19
0
def test_local(context, q, float_data, float_data_gpu):
    cu_source = """
    __global__ void foo(float *data) {
        __shared__ float localmem[33];
        int tid = threadIdx.x;
        int warpId = tid % 32;
        localmem[warpId] = data[warpId];
        data[warpId] = localmem[warpId + 1];
    }
"""
    kernelName = test_common.mangle('foo', ['float *'])
    cl_sourcecode = test_common.cu_to_cl(cu_source, kernelName)
    print('cl_sourcecode', cl_sourcecode)
    kernel = test_common.build_kernel(context, cl_sourcecode, kernelName)
    float_data_orig = np.copy(float_data)
    kernel(q, (32,), (32,), float_data_gpu, offset_type(0), cl.LocalMemory(4))
    cl.enqueue_copy(q, float_data, float_data_gpu)
    q.finish()
    print('before', float_data_orig[:5])
    print('after', float_data[:5])
    assert np.abs(float_data_orig[1:32] - float_data[0:31]).max() <= 1e-4
예제 #20
0
def test_atomic_add_floats(context, q, float_data, float_data_gpu):
    ll_code = """
declare float @_Z9atomicAddIfET_PS0_S0_(float *, float)

define void @mykernel(float* nocapture %data) #1 {
  %1 = call float @_Z9atomicAddIfET_PS0_S0_(float * %data, float 3.25)
  ret void
}
"""
    cl_code = test_common.ll_to_cl(ll_code, 'mykernel', 1)
    print('cl_code', cl_code)
    # try compiling it, just to be sure...
    float_data[0] = 0
    kernel = test_common.build_kernel(context, cl_code, 'mykernel')
    cl.enqueue_copy(q, float_data_gpu, float_data)
    kernel(q, (128,), (32,), float_data_gpu, offset_type(0), offset_type(0), cl.LocalMemory(32))
    from_gpu = np.copy(float_data)
    cl.enqueue_copy(q, from_gpu, float_data_gpu)
    q.finish()
    print('from_gpu[0]', from_gpu[0])
    assert from_gpu[0] == 3.25 * 128
예제 #21
0
def try_build(context, filepath, kernelname):
    with open(filepath, 'r') as f:
        cucode = f.read()
    clcode = test_common.cu_to_cl(cucode, kernelname)
    test_common.build_kernel(context, clcode, kernelname[:31])
예제 #22
0
def try_build(context, ll_filepath, kernelname):
    with open(ll_filepath, 'r') as f:
        llcode = f.read()
    clcode = test_common.ll_to_cl(llcode, kernelname)
    test_common.build_kernel(context, clcode, kernelname)
예제 #23
0
def dotdotdot(context, dotdotdot_cl):
    kernelName = test_common.mangle('test_si', ['float *'])
    kernel = test_common.build_kernel(context, dotdotdot_cl, kernelName)
    return kernel