예제 #1
0
파일: array.py 프로젝트: aerorohit/compyle
def linspace(start,
             stop,
             num,
             dtype=np.float64,
             backend='opencl',
             endpoint=True):
    """Return ``num`` evenly spaced samples from ``start`` to ``stop``.

    For the ``'opencl'`` and ``'cuda'`` backends the samples are built on the
    device as ``arange(0, num) * delta + start``; any other backend value
    falls back to ``np.linspace``. The result is passed through
    ``wrap_array`` together with ``backend``.

    Parameters
    ----------
    start, stop : scalar
        First sample and end of the interval.
    num : int
        Number of samples; Python ints and NumPy integer scalars are accepted.
    dtype : data-type
        Element type of the generated array.
    backend : str
        ``'opencl'``, ``'cuda'`` or anything else for the NumPy fallback.
    endpoint : bool
        When true, ``stop`` is the last sample; otherwise it is excluded.

    Raises
    ------
    TypeError
        If ``num`` is not an integer (booleans are rejected as well).
    ValueError
        If ``num`` is not positive.
    """
    # NumPy integer scalars (e.g. the result of ``len``-like computations on
    # arrays) are legitimate sample counts; bool is an int subclass but is
    # kept out, as the original exact-type check rejected it too.
    if isinstance(num, bool) or not isinstance(num, (int, np.integer)):
        raise TypeError("num should be an integer but got %s" % type(num))
    if num <= 0:
        raise ValueError("Number of samples, %s, must be positive." % num)

    if backend not in ('opencl', 'cuda'):
        out = np.linspace(start, stop, num, endpoint=endpoint, dtype=dtype)
        return wrap_array(out, backend)

    # With a single sample there is no interval to split; dividing by
    # ``num - 1`` would be a division by zero, so fall back to ``num``
    # (the only generated element is then ``start``, like np.linspace).
    intervals = num - 1 if endpoint and num > 1 else num
    delta = (stop - start) / intervals

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        out = gpuarray.arange(get_queue(), 0, num, 1, dtype=dtype)
    else:
        import pycuda.gpuarray as gpuarray
        # Imported for its side effects only; the name itself is not used.
        import pycuda.autoinit  # noqa: F401
        out = gpuarray.arange(0, num, 1, dtype=dtype)
    out = out * delta + start
    return wrap_array(out, backend)
예제 #2
0
def test_fancy_indexing(ctx_factory):
    """Check that ``dest[idx] = src`` on device arrays matches NumPy.

    The scatter is performed once with the initial index array and once more
    after two index entries have been overwritten on both host and device.
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    numpy_dest = np.zeros((4,), np.int32)
    numpy_idx = np.arange(3, 0, -1, dtype=np.int32)
    # The source needs one value per index entry (three of them); a
    # two-element source cannot be broadcast by the NumPy assignment below.
    numpy_src = np.arange(8, 11, dtype=np.int32)
    numpy_dest[numpy_idx] = numpy_src

    cl_dest = cl_array.zeros(queue, (4,), np.int32)
    cl_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    cl_src = cl_array.arange(queue, 8, 11, dtype=np.int32)
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())

    # Change the index arrays identically on device and host, then repeat.
    cl_idx[1] = 3
    cl_idx[2] = 2

    numpy_idx[1] = 3
    numpy_idx[2] = 2

    numpy_dest[numpy_idx] = numpy_src
    cl_dest[cl_idx] = cl_src

    assert np.all(numpy_dest == cl_dest.get())
예제 #3
0
def test_fancy_indexing(ctx_factory):
    """Compare index-array assignment on a device array against NumPy.

    The same scatter (``dest[idx] = src``) is done on host and device, first
    with the initial indices and then after rewriting two index slots.
    """
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")
    queue = cl.CommandQueue(ctx_factory())

    # Host-side reference.
    host_dest = np.zeros((4,), np.int32)
    host_idx = np.arange(3, 0, -1, dtype=np.int32)
    host_src = np.arange(8, 11, dtype=np.int32)
    host_dest[host_idx] = host_src

    # Same operation on the device.
    dev_dest = cl_array.zeros(queue, (4,), np.int32)
    dev_idx = cl_array.arange(queue, 3, 0, -1, dtype=np.int32)
    dev_src = cl_array.arange(queue, 8, 11, dtype=np.int32)
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())

    # Rewrite index slots 1 and 2 on both sides.
    for slot, target in ((1, 3), (2, 2)):
        dev_idx[slot] = target
    for slot, target in ((1, 3), (2, 2)):
        host_idx[slot] = target

    host_dest[host_idx] = host_src
    dev_dest[dev_idx] = dev_src

    assert np.all(host_dest == dev_dest.get())
예제 #4
0
def test_take(ctx_getter):
    """Gather every second element of ``3*k`` and compare with ``3*idx``."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    indices = cl_array.arange(ctx, cq, 0, 200000, 2, dtype=numpy.uint32)
    values = cl_array.arange(ctx, cq, 0, 600000, 3, dtype=numpy.float32)
    gathered = cl_array.take(values, indices)

    expected = (3*indices).get()
    assert (expected == gathered.get()).all()
예제 #5
0
def test_take(ctx_factory):
    """``take`` on a ramp with step 3 must return three times the indices."""
    queue = cl.CommandQueue(ctx_factory())

    picks = cl_array.arange(queue, 0, 200000, 2, dtype=np.uint32)
    ramp = cl_array.arange(queue, 0, 600000, 3, dtype=np.float32)
    taken = cl_array.take(ramp, picks)

    reference = (3 * picks).get()
    assert (reference == taken.get()).all()
예제 #6
0
def arange(start, stop, step, dtype=np.int32, backend='cython'):
    """Create a wrapped ``arange`` array on the requested backend.

    ``'opencl'`` and ``'cuda'`` build the array with the respective GPU array
    module; every other backend value uses ``np.arange``. The raw array is
    handed to ``wrap_array`` together with ``backend``.
    """
    if backend == 'opencl':
        import pyopencl.array as gpuarray
        from .opencl import get_queue
        raw = gpuarray.arange(get_queue(), start, stop, step, dtype=dtype)
        return wrap_array(raw, backend)

    if backend == 'cuda':
        import pycuda.gpuarray as gpuarray
        raw = gpuarray.arange(start, stop, step, dtype=dtype)
        return wrap_array(raw, backend)

    return wrap_array(np.arange(start, stop, step, dtype=dtype), backend)
예제 #7
0
def test_ldexp(ctx_factory):
    """Compare ``clmath.ldexp`` element by element with ``math.ldexp``."""
    queue = cl.CommandQueue(ctx_factory())

    for s in sizes:
        mant_dev = cl_array.arange(queue, s, dtype=np.float32)
        exp_dev = cl_array.arange(queue, s, dtype=np.float32) * 1e-3
        out_dev = clmath.ldexp(mant_dev, exp_dev)

        mant = mant_dev.get()
        exps = exp_dev.get()
        out = out_dev.get()

        for i in range(s):
            # The host reference takes an integer exponent.
            expected = math.ldexp(mant[i], int(exps[i]))
            assert expected == out[i]
예제 #8
0
def test_atan2pi(ctx_factory):
    """Check ``clmath.atan2pi`` against ``math.atan2(y, x) / pi``."""
    queue = cl.CommandQueue(ctx_factory())

    for s in sizes:
        y_dev = (cl_array.arange(queue, s, dtype=np.float32) - np.float32(s / 2)) / 100
        x_dev = (s / 2 - 1 - cl_array.arange(queue, s, dtype=np.float32)) / 100
        res_dev = clmath.atan2pi(y_dev, x_dev)

        ys = y_dev.get()
        xs = x_dev.get()
        res = res_dev.get()

        for i in range(s):
            reference = math.atan2(ys[i], xs[i]) / math.pi
            assert abs(reference - res[i]) < 1e-6
예제 #9
0
def test_fmod(ctx_factory):
    """Compare ``clmath.fmod`` with ``math.fmod`` for each element."""
    queue = cl.CommandQueue(ctx_factory())

    for s in sizes:
        num_dev = cl_array.arange(queue, s, dtype=np.float32) / 10
        den_dev = cl_array.arange(queue, s, dtype=np.float32) / 45.2 + 0.1
        rem_dev = clmath.fmod(num_dev, den_dev)

        num = num_dev.get()
        den = den_dev.get()
        rem = rem_dev.get()

        for i in range(s):
            assert math.fmod(num[i], den[i]) == rem[i]
예제 #10
0
def test_fmod(ctx_getter):
    """Element-wise check of ``clmath.fmod`` against ``math.fmod``."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    for s in sizes:
        dividend_dev = cl_array.arange(ctx, cq, s, dtype=numpy.float32)/10
        divisor_dev = cl_array.arange(ctx, cq, s, dtype=numpy.float32)/45.2 + 0.1
        result_dev = clmath.fmod(dividend_dev, divisor_dev)

        dividend = dividend_dev.get()
        divisor = divisor_dev.get()
        result = result_dev.get()

        for i in range(s):
            expected = math.fmod(dividend[i], divisor[i])
            assert expected == result[i]
예제 #11
0
def test_ldexp(ctx_getter):
    """Element-wise check of ``clmath.ldexp`` against ``math.ldexp``."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    for s in sizes:
        x_dev = cl_array.arange(ctx, cq, s, dtype=numpy.float32)
        e_dev = cl_array.arange(ctx, cq, s, dtype=numpy.float32)*1e-3
        y_dev = clmath.ldexp(x_dev, e_dev)

        x_host = x_dev.get()
        e_host = e_dev.get()
        y_host = y_dev.get()

        for i in range(s):
            # math.ldexp needs an integer exponent, hence the truncation.
            assert math.ldexp(x_host[i], int(e_host[i])) == y_host[i]
예제 #12
0
    def test(ctx_factory):
        """Compare the ``clmath`` function ``name`` with its NumPy equivalent.

        ``name``, ``a``, ``b``, ``threshold`` and ``use_complex`` are taken
        from the enclosing scope. For every size and dtype the maximum
        absolute difference must not exceed the applicable threshold.
        """
        context = ctx_factory()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        # Real dtypes first, complex ones (if requested) appended after them;
        # the double-precision variants only when the device supports them.
        double_ok = has_double_support(context.devices[0])
        dtypes = [np.float32, np.float64] if double_ok else [np.float32]
        if use_complex:
            if double_ok:
                dtypes = dtypes + [np.complex64, np.complex128]
            else:
                dtypes = dtypes + [np.complex64]

        for s in sizes:
            for dtype in map(np.dtype, dtypes):
                is_complex = dtype.kind == "c"

                args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
                if is_complex:
                    args = args + dtype.type(1j) * args

                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                # A float ``use_complex`` doubles as the complex threshold.
                if is_complex and isinstance(use_complex, float):
                    my_threshold = use_complex
                else:
                    my_threshold = threshold

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= my_threshold).all(), (max_err, name, dtype)
예제 #13
0
    def setup_arrays(self, nrays, nsamples, cutoff):
        """Return the device work arrays for the given problem dimensions.

        Array sets are cached in ``self.array_cache`` under the key
        ``(nrays, nsamples, cutoff)``; a new set is allocated from
        ``self.memory_pool`` only on a cache miss.
        """
        cache_key = (nrays, nsamples, cutoff)
        if cache_key in self.array_cache:
            return self.array_cache[cache_key]

        bundle = ArraySet()
        bundle.scratch = cla.empty(
            self.queue, (nsamples, nrays),
            dtype=np.float32, allocator=self.memory_pool)
        bundle.result = cla.empty(
            self.queue, (nrays, ),
            dtype=np.int32, allocator=self.memory_pool)
        bundle.pre_cutoff = cla.empty(
            self.queue, (nrays, cutoff),
            dtype=np.float32, allocator=self.memory_pool)
        bundle.pre_cutoff_squared = cla.empty_like(bundle.pre_cutoff)

        # Running index 0 .. cutoff*nrays - 1.
        bundle.idx = cla.arange(
            self.queue, 0, cutoff * nrays, 1,
            dtype=np.int32, allocator=self.memory_pool)

        self.array_cache[cache_key] = bundle
        return bundle
예제 #14
0
def sim_health_index(n_runs):
    """Simulate ``n_runs`` random walks on an OpenCL device and time them.

    Normal random numbers are generated on the device, a segmented scan is
    run over them (one segment per walk), the result is copied back to the
    host and the elapsed wall-clock time is printed. Nothing is returned.

    :param n_runs: number of simulated walks; each has ``n_steps`` (4160)
        steps.
    """
    # Set up OpenCL context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    # The clock starts after context/queue/pool creation, so that setup is
    # not part of the reported time.
    t0 = time.time()

    rho = 0.5
    mu = 3.0
    # NOTE(review): sigma and z_0 are assigned but not read again in this
    # function (the generator call below passes a literal sigma=1.0).
    sigma = 1.0
    z_0 = mu

    # Generate an array of Normal Random Numbers on GPU of length n_sims*n_steps
    n_steps = int(4160)  # steps per simulated walk
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps),
                          np.float32,
                          mu=0,
                          sigma=1.0)

    # Establish boundaries for each simulated walk (i.e. start and end)
    # Necessary so that we perform scan only within rand walks and not between
    # One flag per step: 1 on the first step of a walk, 0 on all others.
    seg_boundaries = [1] + [0] * (n_steps - 1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(n_runs))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    # GPU: Define Segmented Scan Kernel, scanning simulations: f(n-1) + f(n)
    # Per the expressions below: the first element of a segment contributes
    # ary[i]+mu, every other element ary[i]+(1-rho)*mu, and elements are
    # combined as rho*a+b unless a segment boundary lies between them.
    # NOTE(review): segflags is declared ``char`` in the kernel while the host
    # array is uint8 -- same element size, but confirm the signedness is fine.
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] = item",
        options=[])

    # Output buffer with one float32 slot per random number; it is passed to
    # the scan as ``out``, whose output statement writes out[i].
    dev_result = cl_array.arange(queue,
                                 len(ran),
                                 dtype=np.float32,
                                 allocator=mem_pool)

    # Enqueue and Run Scan Kernel
    prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)

    # Get results back on CPU to plot and do final calcs, just as in Lab 1
    # After the transpose, rows are steps and columns are walks.
    health_index_all = (dev_result.get().reshape(n_runs, n_steps).transpose())

    final_time = time.time()
    time_elapsed = final_time - t0

    print("Simulated %d Health Index in: %f seconds" % (n_runs, time_elapsed))
    #print(health_index_all)
    #print(ran.reshape(n_runs, n_steps).transpose())
    #plt.plot(health_index_all)
    return
    def align_and_damp(self, comps_align):
        """Rebuild the listed device components from the sort index.

        When ``self.Args['Np_stay']`` is zero, every listed component and
        ``'sort_indx'`` are replaced by zero-length arrays of the same dtype.
        Otherwise each component is run through ``_data_align_dbl_knl`` into
        a fresh buffer of length ``Np_stay`` and ``'sort_indx'`` is reset to
        ``0 .. Np_stay - 1``. ``reset_num_parts`` is called in both cases.
        """
        n_stay = self.Args['Np_stay']

        if n_stay == 0:
            for key in comps_align + ['sort_indx']:
                self.DataDev[key] = self.dev_arr(
                    shape=0, dtype=self.DataDev[key].dtype)
            self.reset_num_parts()
            return

        WGS, WGS_tot = self.get_wgs(n_stay)
        for key in comps_align:
            packed = self.dev_arr(dtype=self.DataDev[key].dtype,
                                  shape=(n_stay, ))

            evt = self._data_align_dbl_knl(
                self.queue, (WGS_tot, ), (WGS, ),
                self.DataDev[key].data, packed.data,
                self.DataDev['sort_indx'].data,
                np.uint32(n_stay))
            # Block until the kernel has finished before swapping buffers.
            evt.wait()
            self.DataDev[key] = packed

        self.DataDev['sort_indx'] = arange(
            self.queue, 0, n_stay, 1, dtype=np.uint32)
        self.reset_num_parts()
예제 #16
0
    def test(ctx_getter):
        """Compare the ``clmath`` function ``name`` with its NumPy equivalent.

        ``name``, ``a``, ``b`` and ``threshold`` come from the enclosing
        scope. Every size in ``sizes`` is checked for float32 and, when the
        device supports it, float64.
        """
        context = ctx_getter()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(numpy, numpy_func_names.get(name, name))

        if has_double_support(context.devices[0]):
            dtypes = [numpy.float32, numpy.float64]
        else:
            dtypes = [numpy.float32]

        for s in sizes:
            for dtype in dtypes:
                # Build the arguments in the dtype under test; a hard-coded
                # float32 here would run the float32 case twice and never
                # exercise float64.
                args = cl_array.arange(context,
                                       queue,
                                       a,
                                       b, (b - a) / s,
                                       dtype=dtype)
                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                max_err = numpy.max(numpy.abs(cpu_results - gpu_results))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)
예제 #17
0
def test_arange(ctx_factory):
    """A device ``arange`` of 5000 floats must equal ``np.arange``."""
    queue = cl.CommandQueue(ctx_factory())

    count = 5000
    on_device = cl_array.arange(queue, count, dtype=np.float32)
    reference = np.arange(count, dtype=np.float32)
    assert (reference == on_device.get()).all()
예제 #18
0
def test_arange(ctx_factory):
    """Check ``cl_array.arange`` against the NumPy ramp of the same length."""
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    length = 5000
    dev_ramp = cl_array.arange(cq, length, dtype=np.float32)
    host_ramp = np.arange(length, dtype=np.float32)

    matches = host_ramp == dev_ramp.get()
    assert matches.all()
예제 #19
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random data with ``BitonicSort`` while tracking an index array.

    The sorted values must equal ``np.sort`` of the input, and gathering the
    input through the permuted index must give the same values as gathering
    through ``np.argsort``. Known-bad platform combinations are xfailed, and
    float64 is skipped on devices without double support.

    NOTE(review): ``ctx_factory`` is not used; the context comes from
    ``cl.create_some_context()``.
    """
    import sys
    is_pypy = "__pypy__" in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    device = queue.device
    if device.platform.vendor == "The pocl project" \
            and device.type & cl.device_type.GPU:
        # The two literals are concatenated, so the separating space has to
        # be part of one of them.
        pytest.xfail("bitonic argsort fails on POCL + Nvidia, "
                "at least the K40, as of pocl 1.6, 2021-01-20")

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")
    if (dev.platform.name == "Intel(R) OpenCL" and size == 0):
        pytest.xfail("size-0 arange fails on Intel CL")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    # Identity permutation; it is passed to the sorter as ``idx``.
    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # Sort a copy so that ``m`` keeps the unsorted values for the checks.
    ms = m.copy()
    # enqueue_marker crashes under CL 1.1 pocl if there is anything to wait for
    # (no clEnqueueWaitForEvents) https://github.com/inducer/pyopencl/pull/237
    if (dev.platform.name == "Portable Computing Language"
            and cl.get_cl_header_version() < (1, 2)):
        ms.finish()
        index.finish()
    ms, evt = sorterm(ms, idx=index, axis=0)

    assert np.array_equal(np.sort(m.get()), ms.get())

    # may be False because of identical values in array
    # assert np.array_equal(np.argsort(m.get()), index.get())

    # Check values by indices
    assert np.array_equal(m.get()[np.argsort(m.get())], m.get()[index.get()])
예제 #20
0
def test_atan2pi(ctx_factory):
    """``clmath.atan2pi`` must agree with ``atan2 / pi`` to within 1e-6."""
    context = ctx_factory()
    cq = cl.CommandQueue(context)

    for s in sizes:
        numer_dev = (cl_array.arange(cq, s, dtype=np.float32) -
                     np.float32(s / 2)) / 100
        denom_dev = (s / 2 - 1 - cl_array.arange(cq, s, dtype=np.float32)) / 100
        angle_dev = clmath.atan2pi(numer_dev, denom_dev)

        numer = numer_dev.get()
        denom = denom_dev.get()
        angle = angle_dev.get()

        for i in range(s):
            expected = math.atan2(numer[i], denom[i]) / math.pi
            assert abs(expected - angle[i]) < 1e-6
    def setup_arrays(self, nrays, nsamples, cutoff):
        """Fetch or allocate the device arrays for one parameter triple.

        The triple ``(nrays, nsamples, cutoff)`` is the key into
        ``self.array_cache``. On a miss a new ``ArraySet`` is filled with
        arrays allocated from ``self.memory_pool`` and stored in the cache.
        """
        params = (nrays, nsamples, cutoff)
        if params in self.array_cache:
            return self.array_cache[params]

        def _empty(shape, dtype):
            # All buffers share the queue and the pooled allocator.
            return cla.empty(self.queue, shape, dtype=dtype,
                             allocator=self.memory_pool)

        new_set = ArraySet()
        new_set.scratch = _empty((nsamples, nrays), np.float32)
        new_set.result = _empty((nrays,), np.int32)
        new_set.pre_cutoff = _empty((nrays, cutoff), np.float32)
        new_set.pre_cutoff_squared = cla.empty_like(new_set.pre_cutoff)
        new_set.idx = cla.arange(self.queue, 0, cutoff * nrays, 1,
                                 dtype=np.int32,
                                 allocator=self.memory_pool)

        self.array_cache[params] = new_set
        return new_set
예제 #22
0
def test_arange(ctx_getter):
    """Device ``arange`` (context + queue signature) must match NumPy."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    count = 5000
    dev_ramp = cl_array.arange(ctx, cq, count, dtype=numpy.float32)
    host_ramp = numpy.arange(count, dtype=numpy.float32)
    assert (host_ramp == dev_ramp.get()).all()
예제 #23
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Bitonic sort with an index array must agree with NumPy sort/argsort.

    NOTE(review): ``ctx_factory`` is not used; the context is created with
    ``cl.create_some_context()``.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if platform_name == "Portable Computing Language" and dtype == np.float64:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    # Identity permutation, handed to the sorter as ``idx``.
    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    unsorted = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)

    # A copy is sorted so that ``unsorted`` stays available for the checks.
    sorted_dev, evt = sorter(unsorted.copy(), idx=index, axis=0)

    assert np.array_equal(np.sort(unsorted.get()), sorted_dev.get())

    # Comparing ``index`` with np.argsort directly may fail when the array
    # holds identical values, so the gathered values are compared instead.
    via_numpy = unsorted.get()[np.argsort(unsorted.get())]
    via_index = unsorted.get()[index.get()]
    assert np.array_equal(via_numpy, via_index)
예제 #24
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Check ``BitonicSort`` values and index permutation against NumPy.

    NOTE(review): the ``ctx_factory`` argument is unused here.
    """
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    on_apple_cpu = dev.platform.name == "Apple" and dev.type & cl.device_type.CPU
    if on_apple_cpu:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    pocl_double = (dev.platform.name == "Portable Computing Language"
                   and dtype == np.float64)
    if pocl_double:
        pytest.xfail("Double precision bitonic sort doesn't work on POCL")

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    perm = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    data = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    bitonic = BitonicSort(ctx)
    data_sorted, evt = bitonic(data.copy(), idx=perm, axis=0)

    assert np.array_equal(np.sort(data.get()), data_sorted.get())

    # ``perm`` itself is not compared with np.argsort: with identical values
    # in the array the two permutations may legitimately differ.

    # Check values by indices
    assert np.array_equal(data.get()[np.argsort(data.get())],
                          data.get()[perm.get()])
예제 #25
0
def arange(start, stop, step, dtype=None, backend='cython'):
    """Return an ``Array`` holding ``arange(start, stop, step)``.

    For ``'opencl'`` and ``'cuda'`` the data is created with the matching GPU
    array module and attached through ``set_dev_array``; for any other
    backend the ``Array`` is built directly from ``np.arange``.
    """
    if backend not in ('opencl', 'cuda'):
        return Array(np.arange(start, stop, step, dtype=dtype))

    if backend == 'opencl':
        import pyopencl.array as gpuarray
        device_data = gpuarray.arange(get_queue(), start, stop, step,
                                      dtype=dtype)
    else:
        import pycuda.gpuarray as gpuarray
        device_data = gpuarray.arange(start, stop, step, dtype=dtype)

    result = Array()
    result.set_dev_array(device_data)
    return result
예제 #26
0
def test_arange(ctx_getter):
    """Compare a 5000-element device ramp with ``numpy.arange``."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    size = 5000
    expected = numpy.arange(size, dtype=numpy.float32)
    actual = cl_array.arange(context, queue, size, dtype=numpy.float32).get()
    assert (expected == actual).all()
예제 #27
0
def test_mem_pool_with_arrays(ctx_factory):
    """Arrays created with a memory pool must report it as their allocator.

    This is checked for an ``arange`` result and for the result of adding a
    scalar to an uploaded array.
    """
    queue = cl.CommandQueue(ctx_factory())
    pool = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))

    from_arange = cl_array.arange(queue, 2000, dtype=np.float32, allocator=pool)
    uploaded = cl_array.to_device(queue, np.arange(2000), allocator=pool)
    shifted = uploaded + 4000

    assert from_arange.allocator is pool
    assert shifted.allocator is pool
예제 #28
0
def test_mem_pool_with_arrays(ctx_factory):
    """The pool passed as ``allocator`` must be kept on the resulting arrays."""
    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)
    allocator = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(cq))

    ramp_dev = cl_array.arange(cq, 2000, dtype=np.float32, allocator=allocator)
    # The sum is a new array derived from the uploaded one.
    sum_dev = cl_array.to_device(cq, np.arange(2000), allocator=allocator) + 4000

    for dev_ary in (ramp_dev, sum_dev):
        assert dev_ary.allocator is allocator
예제 #29
0
def test_reduction_not_first_argument(ctx_factory):
    """Reduction kernel whose first argument is a scalar, not an array.

    Computes ``sum(z * x[i] * y[i])`` with ``z = 0.1`` over two identical
    ramps and compares it with the NumPy value. Skipped without ``mako``.
    """
    # https://github.com/inducer/pyopencl/issues/535
    from pytest import importorskip
    importorskip("mako")

    ctx = ctx_factory()
    cq = cl.CommandQueue(ctx)

    length = 400
    x_dev = cl_array.arange(cq, length, dtype=np.float32)
    y_dev = cl_array.arange(cq, length, dtype=np.float32)

    from pyopencl.reduction import ReductionKernel
    scaled_dot = ReductionKernel(
        ctx, np.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="z*x[i]*y[i]",
        arguments="float z, __global float *x, __global float *y")

    device_value = scaled_dot(0.1, x_dev, y_dev).get()
    host_value = 0.1*np.sum(np.arange(length)**2)

    assert abs(device_value - host_value) < 1e-4
예제 #30
0
 def init_indices_buffers(self, image_width, image_height, kernels):
   """Allocate the host and device index buffers.

   Creates an int32 ramp of length ``self.array_size`` on the host and on the
   device, a zeroed device buffer of the same shape for sorted indices, and
   one zeroed int32 host buffer (shaped like ``self.source_host_buffer``) for
   every ``kernels[cell][centre]`` entry.

   ``image_width`` and ``image_height`` are accepted but not used here.
   """
   self.indices_host_buffer = numpy.arange(self.array_size, dtype=numpy.int32)
   self.indices_gpu_buffer = cl_array.arange(self.queue, 0, self.array_size, dtype=numpy.int32)
   self.sorted_indices_gpu_buffer = cl_array.zeros_like(self.indices_gpu_buffer)

   # Mirror the two-level layout of ``kernels``: cell -> centre -> buffer.
   self.indices_host_back_buffers = {}
   for cell, centres in kernels.items():
     self.indices_host_back_buffers[cell] = {
       centre: numpy.zeros_like(self.source_host_buffer, dtype=numpy.int32)
       for centre in centres
     }
예제 #31
0
def test_multi_put(ctx_factory):
    """Scatter nine source arrays with ``multi_put`` and compare with NumPy."""
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    queue = cl.CommandQueue(ctx_factory())

    sources = []
    for _ in range(1, 10):
        sources.append(cl_array.arange(queue, 0, 3, dtype=np.float32))
    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
    destinations = []
    for _ in range(9):
        destinations.append(cl_array.zeros(queue, (10, ), np.float32))

    # Host-side reference: one zeroed array per destination, filled through
    # the same indices.
    expected = []
    for _ in range(9):
        expected.append(np.zeros((10, ), np.float32))
    for ref in expected:
        ref[idx.get()] = np.arange(0, 6, dtype=np.float32)

    cl_array.multi_put(sources, idx, out=destinations)

    # NOTE(review): np.all() receives a generator object here, not the
    # per-array results, so this assertion looks like it always passes --
    # confirm and consider the builtin all().
    assert np.all(
        np.all(expected[i] == destinations[i].get()) for i in range(9))
예제 #32
0
def get_coefficients(signal, fs, cfs):
    """
    Executes all of the gammatone logic. Iterates over each input central
    frequency and executes the filter in OpenCL.

    :param np.array signal: input signal
    :param int fs: sampling frequency.
    :param iter cfs: Some iterator of central frequencies you'd like to filter the wave by.
    :return: 2-D array with one row per central frequency, each row being the
        host copy of the ``bm_g`` buffer computed for that frequency.
    """
    samp_g = cla.to_device(q, signal)
    tpt = np.float64((M_PI + M_PI) / fs)
    # Sample index ramp 0 .. len(signal) - 1 on the device.
    ts_g = cla.arange(q, 0, len(signal), dtype=np.float64)
    coefficients = []
    for cf in cfs:
        # Calculating the parameters for the given center frequencies
        tptbw = tpt * erb(cf) * BW_CORRECTION
        decay = np.exp(-tptbw)
        gain = np.float64(tptbw)
        # Setting up memory for everything.
        qcos = cla.empty_like(ts_g)
        qsin = cla.empty_like(ts_g)
        bm_g = cla.empty_like(ts_g)
        p0r_g = cla.empty_like(ts_g)
        p0i_g = cla.empty_like(ts_g)
        ur_g = cla.empty_like(ts_g)
        ui_g = cla.empty_like(ts_g)
        # Preparing the imaginary/real cyclical effect
        q_maker(qcos, qsin, ts_g, tpt, cf)
        cosx = samp_g * qcos
        sinx = samp_g * qsin
        # Performing the filtering operation
        p0(cosx, p0r_g, decay)
        p0(sinx, p0i_g, decay)
        # Preparing the memory to calculate basilar membrane displacement.
        # p1* is a shifted copy of p0*, p2* a shifted copy of p1*.
        p1r_g = p0r_g.copy(q)
        shift(p1r_g)
        p2r_g = p1r_g.copy(q)
        shift(p2r_g)
        p1i_g = p0i_g.copy(q)
        shift(p1i_g)
        p2i_g = p1i_g.copy(q)
        shift(p2i_g)
        # Calculating Basilar Membrane displacement
        u0(p0r_g, p1r_g, p2r_g, ur_g, decay)
        u0(p0i_g, p1i_g, p2i_g, ui_g, decay)
        bm_maker(ur_g, ui_g, bm_g, gain)
        # Append the host copy of the result to the list
        cl_x = bm_g.get()
        coefficients.append(cl_x)

    # np.row_stack is a deprecated alias of np.vstack (NumPy 2.0).
    return np.vstack(coefficients)
예제 #33
0
파일: mynp.py 프로젝트: ixtel/neurolabcl
def linspace(start, stop, num=50, endpoint=True, retstep=False, dtype=float_):
    """Return ``num`` evenly spaced values as a ``myclArray``.

    Built from ``clarray.arange`` with a computed step and truncated to
    ``num`` elements. For ``num < 2`` a one-element array holding ``start``
    is returned.

    NOTE(review): ``retstep`` and ``dtype`` are accepted but not used; the
    array is always created with ``float_`` and the step is never returned.
    """
    #TODO: create native function
    if num < 2:
        return array([start])

    intervals = num - 1 if endpoint else num
    step = (stop - start) / intervals
    # arange excludes its upper bound, so push it one step further when the
    # end point has to be part of the result.
    upper = stop + step if endpoint else stop

    res = clarray.arange(queue, start, upper, step, dtype=float_)[:num]
    # Re-tag the pyopencl array as the project's subclass and let it
    # initialise itself.
    res.__class__ = myclArray
    res.reinit()
    return res
예제 #34
0
def test_scalar_array_take_offset(ctx_factory):
    """Pass a scalar element taken from a device array into a kernel.

    Element 13 of ``arange(42)`` is fed to a kernel computing ``133*x``; the
    result must be 1729.
    """
    import pyopencl.array as cla

    queue = cl.CommandQueue(ctx_factory())

    kernel = lp.make_kernel("{:}", """
        y = 133*x
        """, [lp.GlobalArg("x", shape=(), offset=lp.auto), ...])

    base = cla.arange(queue, 42, dtype=np.int32)
    element = base[13]

    evt, outputs = kernel(queue, x=element)
    (out, ) = outputs
    np.testing.assert_allclose(out.get(), 1729)
예제 #35
0
    def build_scratch(self, imshape):
        """Allocate the index ramp and the per-level scratch buffers.

        ``self.array_indices`` becomes an int32 ramp over all elements of an
        image of shape ``imshape``. The element count is then repeatedly
        divided by ``self.runlen``; for every level a float32 buffer is
        appended to ``self.scratch`` and an int32 buffer of the same length
        to ``self.index_scratch``, until the length is no longer above 1.
        """
        self.scratch = []
        self.index_scratch = []

        length = np.prod(imshape)
        self.array_indices = cla.arange(self.queue, 0, length, 1, dtype=np.int32)
        # NOTE(review): this adds the remainder rather than the amount
        # missing to the next multiple of runlen -- confirm the intent.
        if length % self.runlen != 0:
            length += length % self.runlen
        while length > 1:
            # Floor division keeps the length an integer; true division
            # would produce a float, which is not a valid array shape.
            length //= self.runlen
            self.scratch.append(cla.empty(self.queue, (length, ), np.float32))
            self.index_scratch.append(cla.empty(self.queue, (length, ), np.int32))

        self.imshape = imshape
예제 #36
0
def test_multi_put(ctx_factory):
    """Scatter nine source arrays with ``multi_put`` and compare with NumPy."""
    if _PYPY:
        pytest.xfail("numpypy: multi value setting is not supported")

    context = ctx_factory()
    queue = cl.CommandQueue(context)

    cl_arrays = [
        cl_array.arange(queue, 0, 3, dtype=np.float32)
        for i in range(1, 10)
    ]
    idx = cl_array.arange(queue, 0, 6, dtype=np.int32)
    out_arrays = [
        cl_array.zeros(queue, (10,), np.float32)
        for i in range(9)
    ]

    out_compare = [np.zeros((10,), np.float32) for i in range(9)]
    for ary in out_compare:
        # ``idx`` selects six slots, so six values are needed; a
        # three-element right-hand side cannot be broadcast by NumPy.
        ary[idx.get()] = np.arange(0, 6, dtype=np.float32)

    cl_array.multi_put(cl_arrays, idx, out=out_arrays)

    # NOTE(review): np.all() receives a generator object here, not the
    # per-array results, so this assertion looks like it always passes --
    # confirm and consider the builtin all().
    assert np.all(np.all(out_compare[i] == out_arrays[i].get()) for i in range(9))
예제 #37
0
    def build_scratch(self, imshape):
        """Create the index ramp and the scratch buffers for every level.

        An int32 ramp covering all elements of ``imshape`` is stored in
        ``self.array_indices``. The element count is then divided by
        ``self.runlen`` level by level; each level gets a float32 buffer in
        ``self.scratch`` and an int32 buffer in ``self.index_scratch``.
        """
        self.scratch = []
        self.index_scratch = []

        n_items = np.prod(imshape)
        self.array_indices = cla.arange(self.queue, 0, n_items, 1, dtype=np.int32)
        # NOTE(review): the remainder itself is added here, which does not
        # always reach a multiple of runlen -- confirm the intent.
        if n_items % self.runlen != 0:
            n_items += n_items % self.runlen
        while n_items > 1:
            # Integer division: a float length (true division) is not a
            # valid array shape.
            n_items //= self.runlen
            self.scratch.append(cla.empty(self.queue, (n_items,), np.float32))
            self.index_scratch.append(cla.empty(self.queue, (n_items,), np.int32))

        self.imshape = imshape
예제 #38
0
파일: mynp.py 프로젝트: ixtel/neurolabcl
def arange(start, stop=0):
    """Return the unit-step range ``start .. stop - 1`` as a ``myclArray``.

    With a single argument (or a falsy ``stop``) the range is
    ``0 .. start - 1``. The dtype is ``float_`` when either bound is a
    float, ``np.uint32`` for a non-negative ``start`` and ``np.int32``
    otherwise.

    NOTE(review): because ``stop`` defaults to 0, an explicit ``stop=0``
    (e.g. ``arange(-5, 0)``) is indistinguishable from the one-argument
    form and is treated as ``arange(0, -5)``.
    """
    if not stop:
        start, stop = 0, start
    # isinstance also recognises float subclasses, which an exact type
    # comparison would send down the integer branches.
    if isinstance(start, float) or isinstance(stop, float):
        dtype = float_
    elif start >= 0:
        dtype = np.uint32
    else:
        dtype = np.int32
    res = clarray.arange(queue, start, stop, 1, dtype=dtype)
    # Re-tag the pyopencl array as the project's subclass and let it
    # initialise itself.
    res.__class__ = myclArray
    res.reinit()
    return res
예제 #39
0
def test_abs(ctx_getter):
    """``abs`` of a negated device ramp must give back the ramp."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    negated = -cl_array.arange(ctx, cq, 111, dtype=numpy.float32)
    host_neg = negated.get()

    for i in range(111):
        assert host_neg[i] <= 0

    host_abs = abs(negated).get()

    for i in range(111):
        assert abs(host_abs[i]) >= 0
        assert host_abs[i] == i
        assert res[i] == i
예제 #40
0
def test_modf(ctx_getter):
    """Check ``clmath.modf`` parts against ``math.modf`` element by element."""
    ctx = ctx_getter()
    cq = cl.CommandQueue(ctx)

    for s in sizes:
        values_dev = cl_array.arange(ctx, cq, s, dtype=numpy.float32)/10
        frac_dev, whole_dev = clmath.modf(values_dev)

        values = values_dev.get()
        whole = whole_dev.get()
        frac = frac_dev.get()

        for i in range(s):
            frac_ref, whole_ref = math.modf(values[i])

            # Integer part must match exactly, fraction within a tolerance.
            assert whole_ref == whole[i]
            assert abs(frac_ref - frac[i]) < 1e-4
예제 #41
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Sort random data with ``BitonicSort`` while permuting an index array.

    Platform combinations that are known not to work are xfailed and float64
    is skipped on devices without double support. The sorted values and the
    values gathered through the index are then compared with NumPy.

    NOTE(review): ``ctx_factory`` is unused; the context comes from
    ``cl.create_some_context()``.
    """
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if is_pypy and not size:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                     "on zero-sized arrays very well")

    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    platform_name = dev.platform.name
    on_pocl = platform_name == "Portable Computing Language"

    if on_pocl and sys.platform == "darwin":
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if platform_name == "Apple" and dev.type & cl.device_type.CPU:
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
                     "parallelism")
    # The version lookup stays last so it only runs for POCL with float64.
    if (on_pocl and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail(
            "Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        from pytest import skip
        skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    perm = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    values = clrandom.rand(queue, (size, ), dtype, luxury=None, a=0, b=239432234)

    sorter = BitonicSort(ctx)
    values_sorted, evt = sorter(values.copy(), idx=perm, axis=0)

    assert np.array_equal(np.sort(values.get()), values_sorted.get())

    # ``perm`` is not compared with np.argsort directly: identical values in
    # the array may make the two permutations differ.

    # Check values by indices
    gathered_ref = values.get()[np.argsort(values.get())]
    gathered_dev = values.get()[perm.get()]
    assert np.array_equal(gathered_ref, gathered_dev)
예제 #42
0
def test_frexp(ctx_getter):
    """Compare ``clmath.frexp`` with ``math.frexp`` for every size in ``sizes``."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    for n in sizes:
        dev_values = cl_array.arange(context, queue, n, dtype=numpy.float32)/10
        dev_sig, dev_exp = clmath.frexp(dev_values)

        # Bring everything back to the host before comparing element-wise.
        host_values = dev_values.get()
        host_sig = dev_sig.get()
        host_exp = dev_exp.get()

        for value, sig, exp in zip(host_values, host_sig, host_exp):
            expected_sig, expected_exp = math.frexp(value)

            assert expected_sig == sig
            assert expected_exp == exp
예제 #43
0
def test_abs(ctx_getter):
    """Negate an ``arange`` device array, then check ``abs`` restores it."""
    context = ctx_getter()
    queue = cl.CommandQueue(context)

    count = 111
    a = -cl_array.arange(context, queue, count, dtype=numpy.float32)

    # Every negated entry must be non-positive.
    negated = a.get()
    for value in negated:
        assert value <= 0

    a = abs(a)

    # After abs() the entries must equal their own index again.
    restored = a.get()
    for position, value in enumerate(restored):
        assert abs(value) >= 0
        assert value == position
예제 #44
0
def test_frexp(ctx_factory):
    """Check ``clmath.frexp`` against ``math.frexp`` for every size in ``sizes``."""
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for length in sizes:
        device_input = cl_array.arange(queue, length, dtype=np.float32) / 10
        device_sig, device_exp = clmath.frexp(device_input)

        host_input = device_input.get()
        host_sig = device_sig.get()
        host_exp = device_exp.get()

        # Each element must split into exactly the same (significand, exponent).
        for idx, value in enumerate(host_input):
            reference = math.frexp(value)

            assert reference[0] == host_sig[idx]
            assert reference[1] == host_exp[idx]
예제 #45
0
def test_modf(ctx_factory):
    """Check ``clmath.modf`` against ``math.modf`` for every size in ``sizes``.

    Integer parts must match exactly; fractional parts may differ by less
    than 1e-4.
    """
    context = ctx_factory()
    queue = cl.CommandQueue(context)

    for length in sizes:
        device_input = cl_array.arange(queue, length, dtype=np.float32) / 10
        device_frac, device_int = clmath.modf(device_input)

        host_input = device_input.get()
        host_int = device_int.get()
        host_frac = device_frac.get()

        for value, got_int, got_frac in zip(host_input, host_int, host_frac):
            want_frac, want_int = math.modf(value)

            assert want_int == got_int
            assert abs(want_frac - got_frac) < 1e-4
예제 #46
0
def test_bitonic_argsort(ctx_factory, size, dtype):
    """Verify ``BitonicSort`` with an index array (argsort-style usage).

    Random values of ``dtype`` and length ``size`` are sorted on the device
    while an ``arange`` index array is permuted alongside.  The sorted values
    are compared with ``np.sort`` and the values selected by the permuted
    indices are compared with those selected by ``np.argsort``.
    """
    import sys
    is_pypy = '__pypy__' in sys.builtin_module_names

    if not size and is_pypy:
        # https://bitbucket.org/pypy/numpy/issues/53/specifying-strides-on-zero-sized-array
        pytest.xfail("pypy doesn't seem to handle as_strided "
                "on zero-sized arrays very well")

    # NOTE(review): ctx_factory is accepted but unused; the context is created
    # with cl.create_some_context() -- confirm this is intended.
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    dev = ctx.devices[0]
    if (dev.platform.name == "Portable Computing Language"
            and sys.platform == "darwin"):
        pytest.xfail("Bitonic sort crashes on Apple POCL")
    if (dev.platform.name == "Apple" and dev.type & cl.device_type.CPU):
        pytest.xfail("Bitonic sort won't work on Apple CPU: no workgroup "
            "parallelism")
    if (dev.platform.name == "Portable Computing Language"
            and dtype == np.float64
            and get_pocl_version(dev.platform) < (1, 0)):
        pytest.xfail("Double precision bitonic sort doesn't work on POCL < 1.0")

    if dtype == np.float64 and not has_double_support(dev):
        pytest.skip("double precision not supported on %s" % dev)

    import pyopencl.clrandom as clrandom
    from pyopencl.bitonic_sort import BitonicSort

    index = cl_array.arange(queue, 0, size, 1, dtype=np.int32)
    m = clrandom.rand(queue, (size,), dtype, luxury=None, a=0, b=239432234)

    sorterm = BitonicSort(ctx)

    # The sorter receives a copy, so ``m`` still holds the unsorted values.
    ms, _evt = sorterm(m.copy(), idx=index, axis=0)

    # Fetch the unsorted data once instead of once per use.
    unsorted_host = m.get()

    assert np.array_equal(np.sort(unsorted_host), ms.get())

    # The index arrays themselves may legitimately differ when the data holds
    # identical values, so only the selected values are compared.
    assert np.array_equal(unsorted_host[np.argsort(unsorted_host)],
                          unsorted_host[index.get()])
예제 #47
0
    def test(ctx_factory):
        """Compare a ``clmath`` function with its NumPy counterpart.

        ``name``, ``a``, ``b``, ``threshold`` and ``use_complex`` are taken
        from the enclosing scope, which is not visible in this block.
        """
        context = ctx_factory()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        dev = context.devices[0]

        # float32 is always tested; wider and complex types are added only
        # when the device and the ``use_complex`` setting allow them.
        candidates = [np.float32]
        if has_double_support(dev):
            candidates.append(np.float64)
            if use_complex:
                candidates.append(np.complex64)
                if has_struct_arg_count_bug(dev) != "apple":
                    candidates.append(np.complex128)
        elif use_complex:
            candidates.append(np.complex64)
        dtypes = [np.dtype(candidate) for candidate in candidates]

        for s in sizes:
            for dtype in dtypes:
                args = cl_array.arange(queue, a, b, (b - a) / s, dtype=dtype)
                if dtype.kind == "c":
                    # Give complex inputs a non-zero imaginary part.
                    args = args + args * dtype.type(1j)

                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                # A float ``use_complex`` doubles as the complex tolerance.
                if dtype.kind == "c" and isinstance(use_complex, float):
                    my_threshold = use_complex
                else:
                    my_threshold = threshold

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= my_threshold).all(), \
                        (max_err, name, dtype)
예제 #48
0
    def test(ctx_getter):
        """Compare a ``clmath`` function with its NumPy counterpart.

        ``name``, ``a``, ``b`` and ``threshold`` are taken from the enclosing
        scope, which is not visible in this block.  The comparison runs for
        float32 and, when the device supports it, float64.
        """
        context = ctx_getter()
        queue = cl.CommandQueue(context)

        gpu_func = getattr(clmath, name)
        cpu_func = getattr(np, numpy_func_names.get(name, name))

        if has_double_support(context.devices[0]):
            dtypes = [np.float32, np.float64]
        else:
            dtypes = [np.float32]

        for s in sizes:
            for dtype in dtypes:
                # Build the input in the dtype under test.  The previous code
                # hard-coded float32 here, so the float64 pass silently
                # repeated the float32 check.
                args = cl_array.arange(queue, a, b, (b-a)/s,
                        dtype=dtype)
                gpu_results = gpu_func(args).get()
                cpu_results = cpu_func(args.get())

                max_err = np.max(np.abs(cpu_results - gpu_results))
                assert (max_err <= threshold).all(), \
                        (max_err, name, dtype)
예제 #49
0
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la
import scipy as sp
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel

# Make sure we use the GPU for computations.
platform=cl.get_platforms()
gpu_devices=platform[0].get_devices(device_type=cl.device_type.GPU)
ctx=cl.Context(devices=gpu_devices)
queue = cl.CommandQueue(ctx)

# Integration, one integral, using reduce
length=0.001
# NumPy has no ``double32`` type; the kernel below works on C ``double``,
# which corresponds to numpy.float64.
vals=cl_array.arange(queue,0,100,length,dtype=numpy.float64)
# Kernel for reduce-code: sums get_val(x[i]) * length over the array.
krnlRed=ReductionKernel(ctx,numpy.float64,neutral="0",
    reduce_expr="a+b",map_expr="get_val(x[i])*%10.3f" % length,
    arguments="__global double *x",
    preamble="""
    double get_val(double x)
    {
        return x*x;
    }
    """)

# Generation of an array where each element is an evaluated integral.
tonum=1000000 # Number of elements.
# Array to send to the GPU.  to_device() takes the queue first (passing a
# context in front is a deprecated calling convention), and numpy.linspace
# replaces the deprecated scipy alias of the same function.
p_gpu=cl_array.to_device(queue,numpy.linspace(0,tonum,tonum+1).astype(numpy.float64))
예제 #50
0
# GPU: segmented scan kernel; a set segment flag restarts the scan
prefix_sum = GenericScanKernel(
    ctx,
    np.float32,
    neutral="0",
    arguments=("__global float *ary, __global char *segflags, "
               "__global float *out, float rho, float mu"),
    is_segment_start_expr="segflags[i]",
    input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
    scan_expr="across_seg_boundary ? (b):(rho*a+b)",
    output_statement="out[i] =(item>0) ? (0):(1)",
    options=[])

# Device array reserved for the results, one entry per random number
dev_result = cl_array.arange(
    queue, len(ran), allocator=mem_pool, dtype=np.float32)

# Bounds and starting value for rho, then run the minimisation
xmin, xmax = -0.95, 0.95
x0 = np.array([0.1])  # initial set-up of rho
rhomini = minimize(
    mini_parallel,
    x0,
    bounds=((xmin, xmax), ),
    method='COBYLA',
    options={'rhobeg': 0.01},
)

# Timestamp taken once the minimisation has returned
final_time = time.time()
예제 #51
0
def sim_health_index(n_runs):
    """Simulate health-index walks on the GPU for 200 values of rho and plot.

    For each rho in ``np.linspace(-0.95, 0.95, 200)`` a segmented scan is run
    over ``n_runs`` walks of 4160 steps each.  The kernel writes 1 where the
    scanned value is not positive and 0 otherwise; the first flagged step of
    every walk (or the number of steps if none is flagged) is averaged over
    all walks.  The averages are plotted against rho and saved to
    ``GPU_rho_avgt_nruns<n_runs>.png``; timing and the best rho are printed.

    Args:
        n_runs: number of simulated walks.
    """
    # Set up OpenCL context and command queue
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mem_pool = cltools.MemoryPool(cltools.ImmediateAllocator(queue))

    t0 = time.time()

    mu = 3.0

    # Generate an array of Normal Random Numbers on GPU of length n_runs*n_steps
    n_steps = 4160
    rand_gen = clrand.PhiloxGenerator(ctx)
    ran = rand_gen.normal(queue, (n_runs * n_steps),
                          np.float32,
                          mu=0,
                          sigma=1.0)

    # Establish boundaries for each simulated walk (i.e. start and end)
    # Necessary so that we perform scan only within rand walks and not between
    seg_boundaries = [1] + [0] * (n_steps - 1)
    seg_boundaries = np.array(seg_boundaries, dtype=np.uint8)
    seg_boundary_flags = np.tile(seg_boundaries, int(n_runs))
    seg_boundary_flags = cl_array.to_device(queue, seg_boundary_flags)

    # GPU: Define Segmented Scan Kernel, scanning simulations: rho*f(n-1)+(1-rho)*mu+ran
    # The output is 1 where the scanned value is not positive, else 0.
    prefix_sum = GenericScanKernel(
        ctx,
        np.float32,
        arguments="__global float *ary, __global char *segflags, "
        "__global float *out, float rho, float mu",
        input_expr="segflags[i] ? (ary[i]+mu):(ary[i]+(1-rho)*mu)",
        scan_expr="across_seg_boundary ? (b):(rho*a+b)",
        neutral="0",
        is_segment_start_expr="segflags[i]",
        output_statement="out[i] =(item>0) ? (0):(1)",
        options=[])

    # Output buffer for the kernel, one entry per random number.
    dev_result = cl_array.arange(queue,
                                 len(ran),
                                 dtype=np.float32,
                                 allocator=mem_pool)

    # Iterate For 200 rho values
    rho_set = np.linspace(-0.95, 0.95, 200)
    rho_avgt_t = []
    for rho in rho_set:
        # Enqueue and run the scan kernel for this rho.
        prefix_sum(ran, seg_boundary_flags, dev_result, rho, mu)
        # Bring the flags back to the host, one row per simulated walk.
        health_index_all = dev_result.get().reshape(n_runs, n_steps)

        # Index of the first flagged step in every walk; walks without any
        # flag count as n_steps.  argmax on a boolean row returns the first
        # True, and the any() mask separates "flag at step 0" from "no flag".
        flagged = health_index_all == 1
        first_flag = np.where(flagged.any(axis=1),
                              flagged.argmax(axis=1),
                              n_steps)
        avg_t = int(first_flag.sum()) / len(first_flag)
        rho_avgt_t.append(avg_t)

    final_time = time.time()
    time_elapsed = final_time - t0
    print("Simulated %d Health Index for 200 rho values in: %f seconds" %
          (n_runs, time_elapsed))

    plt.plot(rho_set, rho_avgt_t)
    plt.title('Averaged periods of first negative index across Rho')
    plt.xlabel('Rho')
    plt.ylabel('Avged Period of first negative index')
    plt.savefig("GPU_rho_avgt_nruns%d.png" % (n_runs))

    max_period = max(rho_avgt_t)
    max_rho = rho_set[rho_avgt_t.index(max_period)]
    print("Max Period: %f; Max Rho: %f." % (max_period, max_rho))
    return
예제 #52
0
def test_speed(file_name , ctx_str):
    """Time repeated ``rk_step`` kernel launches and log the results as CSV.

    The header row is written to ``file_name`` first.  Then, for every space
    step in ``hs`` and every RK order in ``orders`` (both module-level), a 3-D
    kernel is generated and built, ``nSpeed`` steps are run on an all-ones
    field, and one row with the elapsed wall-clock time is appended.

    Args:
        file_name: path of the CSV file to (over)write.
        ctx_str: value stored in the ``PYOPENCL_CTX`` environment variable
            before the OpenCL context is created.
    """
    os.environ["PYOPENCL_CTX"] = ctx_str

    with open(file_name, "w") as myfile:
        myfile.write("RK order , space step , number of time steps , number of points, total time\n")

    # setup stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    for h in hs:
        for order in orders:

            # mesh sizes: the same step in every direction
            hx = h
            hy = h
            hz = h

            # center locations
            xCenters   = np.arange(hx/2, a + hx/2, hx)
            nxCenters  = len(xCenters)
            yCenters   = np.arange(hy/2, b + hy/2, hy)
            nyCenters  = len(yCenters)
            # The z grid uses its own step hz (it previously reused hy, which
            # only gave the right result because all three steps are equal).
            zCenters   = np.arange(hz/2, c + hz/2, hz)
            nzCenters  = len(zCenters)

            # Fixed CFL bound; the earlier, disabled code derived it from the
            # velocity field as max|U| + max|V| + 1.
            cfl = 3

            # time step
            ht = hx/(factor*cfl)

            # The field is all ones (float32 so it works with the kernel);
            # only the run time is of interest here.
            T = np.ones((nyCenters, nxCenters, nzCenters), dtype=np.float32)

            # Do string manipulations under the hood
            prg_str = getstring.get3d(order, hx, hy, hz, ht, nxCenters, nyCenters, nzCenters)

            # compile
            prg = cl.Program(ctx, prg_str).build()

            # create memory pools and the device input/output arrays
            n_points   = nxCenters*nyCenters*nzCenters
            in_pool    = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
            Tin_d      = cl_array.arange(queue, n_points, dtype=np.float32, allocator=in_pool)
            out_pool   = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
            Tout_d     = cl_array.arange(queue, n_points, dtype=np.float32, allocator=out_pool)

            # start the timing:
            start = time.time()

            # do the time stepping
            for _ in range(nSpeed):

                # Copy T into Tin_d
                Tin_d.set(T.ravel(), queue=queue)

                # Apply the kernel here
                prg.rk_step(queue, T.shape, None,
                        Tin_d.data, Tout_d.data)

                # Copy data into T
                Tout_d.get(queue=queue, ary=T)

            # End timing for this round
            end = time.time()

            # RK order , space step , number of time steps , number of points, total time
            fields = (order, h, nSpeed, T.size, end - start)
            data = " , ".join(str(field) for field in fields) + "\n"
            with open(file_name, "a") as myfile:
                myfile.write(data)
예제 #53
0
파일: pix.py 프로젝트: yairdaon/advecl
def make_pix(rk_ord , set_up, target, h, final):
    """Advect a 2-D tracer field with an OpenCL RK kernel and save frames.

    The velocity field, the domain extents ``a``/``b``, ``factor``, ``mu``,
    ``sig`` and ``levels`` are module-level names.  Every 20th time step a
    contour plot of the tracer is written to ``<target>/frame<step>.png``.

    Args:
        rk_ord: Runge-Kutta order (converted with ``int``); 1 is Euler.
        set_up: value stored in the ``PYOPENCL_CTX`` environment variable.
        target: directory the frame images are written to.
        h: mesh size used in both directions (converted with ``float``).
        final: final time; ``int(int(final)/ht)`` steps are taken.
    """
    os.environ["PYOPENCL_CTX"] = set_up

    # RK order 1 = Euler
    order = int(rk_ord)

    # mesh sizes
    hx = float(h)
    hy = float(h)

    # center locations
    xCenters   = np.arange(hx/2, a + hx/2, hx)
    nxCenters  = len(xCenters)
    yCenters   = np.arange(hy/2, b + hy/2, hy)
    nyCenters  = len(yCenters)

    # grids and velocity field; the CFL bound is max|U| + max|V|
    X, Y = np.meshgrid(xCenters, yCenters)
    U  = PIB * psi * np.cos( Y * PIB ) * (   P * np.exp(A*X)  +      (1-P) * np.exp(B*X) -1  )/D
    V  =     - psi * np.sin( Y * PIB ) * ( A*P * np.exp(A*X)  +  B * (1-P) * np.exp(B*X)     )/D
    cfl = np.max(np.abs(U)) +  np.max(np.abs(V))

    # times
    ht    = hx/(factor*cfl)   # time step
    nt    = int(int(final)/ht)    # number of time steps

    # define the initial distribution of T
    T = np.exp( - (
            ((X-mu[0])/sig[0])**4 +
            ((Y-mu[1])/sig[1])**4
            )/2.0
              )
    T = T.astype(np.float32) # cast to float32 so it works with kernel

    # setup stuff
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)

    # Get the kernel from a string
    prg_str = getstring.get2d(order , hx, hy, ht, nxCenters, nyCenters)

    # compile
    prg = cl.Program(ctx, prg_str).build()

    # create memory pools and the device input/output arrays
    in_pool    = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    Tin_d      = cl_array.arange(queue, nxCenters*nyCenters, dtype=np.float32, allocator=in_pool)
    out_pool   = cl_tools.MemoryPool(cl_tools.ImmediateAllocator(queue))
    Tout_d     = cl_array.arange(queue, nxCenters*nyCenters, dtype=np.float32, allocator=out_pool)

    # do time stepping and plotting.
    for i in range(nt):

        # Save a frame every 20th step.  The test used to be ``nt % 20``,
        # which does not depend on the step and therefore plotted either
        # every step or none at all.
        if i % 20 == 0:
            fig = plt.figure()
            CS = plt.contour(X, Y, T, levels=levels)
            plt.clabel(CS)
            plt.title("Tracer concentration. Spatial FEM and RK" + str(order) +" Time Steps.\n Step "
                      +str(i) + ",T = " + str(i*ht)+ " weeks.")
            # streamplot is not available in every matplotlib version
            if hasattr(plt, "streamplot"):
                plt.streamplot(X, Y, U, V, color=U, linewidth=2, cmap=plt.cm.autumn)
            fig.savefig(target + '/frame' + str(i) + '.png')
            plt.close(fig)

        # Copy T into Tin_d
        Tin_d.set( T.ravel(), queue=queue )

        # Apply the kernel here
        prg.rk_step(queue, T.shape, None,
                    Tin_d.data, Tout_d.data)

        # Copy data into T
        Tout_d.get(queue=queue , ary=T)
예제 #54
0
import pyopencl.array as cl_array
import numpy
import numpy.linalg as la
import scipy as sp
from pyopencl.reduction import ReductionKernel
from pyopencl.elementwise import ElementwiseKernel

# Make sure we use the GPU for computations.
platform=cl.get_platforms()
gpu_devices=platform[0].get_devices(device_type=cl.device_type.GPU)
ctx=cl.Context(devices=gpu_devices)
queue = cl.CommandQueue(ctx)

# Integration, one integral, using reduce
length=0.001
vals=cl_array.arange(queue,0,100,length,dtype=numpy.float32)
# Kernel for reduce-code: sums get_val(x[i]) * length over the array.
krnlRed=ReductionKernel(ctx,numpy.float32,neutral="0",
    reduce_expr="a+b",map_expr="get_val(x[i])*%10.3f" % length,
    arguments="__global float *x",
    preamble="""
    float get_val(float x)
    {
        return x*x;
    }
    """)

# Generation of an array where each element is an evaluated integral.
tonum=1000000 # Number of elements.
# Array to send to the GPU.  to_device() takes the queue first (passing a
# context in front is a deprecated calling convention), and numpy.linspace
# replaces the deprecated scipy alias of the same function.
p_gpu=cl_array.to_device(queue,numpy.linspace(0,tonum,tonum+1).astype(numpy.float32))
예제 #55
0
파일: mynp.py 프로젝트: ixtel/neurolabcl
def get_arng(size, dtype=np.int32):
    """Return a device array counting from 0 up to ``size`` in steps of 1.

    The array is created on the module-level ``queue`` with the given dtype.
    """
    start, step = 0, 1
    device_range = clarray.arange(queue, start, size, step, dtype=dtype)
    return device_range