Example #1
import numpy as np
import numpy.linalg as la
import pyopencl as cl


def test_enqueue_task(ctx_factory):
    ctx = ctx_factory()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags

    prg = cl.Program(
        ctx, """
    __kernel void
    reverse(__global const float *in, __global float *out, int n)
    {
        for (int i = 0; i < n; i++) {
            out[i] = in[n - 1 - i];
        }
    }
    """).build()
    knl = prg.reverse

    n = 100
    a = np.random.rand(n).astype(np.float32)
    b = np.empty_like(a)

    buf1 = cl.Buffer(ctx, mf.READ_ONLY | mf.COPY_HOST_PTR, hostbuf=a)
    buf2 = cl.Buffer(ctx, mf.WRITE_ONLY, b.nbytes)

    knl.set_args(buf1, buf2, np.int32(n))
    cl.enqueue_task(queue, knl)

    cl.enqueue_copy(queue, b, buf2).wait()
    assert la.norm(a[::-1] - b) == 0
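`clEnqueueTask` is deprecated as of OpenCL 2.0: it is defined as an NDRange launch with a global and local size of one work-item. As a sketch, the same launch expressed through the NDRange API in PyOpenCL:

# Equivalent one-work-item NDRange launch:
knl.set_args(buf1, buf2, np.int32(n))
cl.enqueue_nd_range_kernel(queue, knl, (1,), (1,))

# or, via PyOpenCL's kernel-call shorthand:
prg.reverse(queue, (1,), (1,), buf1, buf2, np.int32(n))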
Example #2
    def fpga_function(self, benchmark=False):
        for layer in range(0, self.layers):
            rows_in = self.layer_height[layer]
            rows_out = self.layer_height[layer + 1]
            is_last_layer = layer + 1 == self.layers

            print("Running layer {} ({}->{}) kernel, with relu: {}".format(
                layer, rows_in, rows_out, 1 if not is_last_layer else 0))
            self.kForward.set_args(self.act_buffers[layer],
                                   self.weights_buffers[layer],
                                   self.bias_buffers[layer],
                                   self.act_buffers[layer + 1],
                                   self.minibatch_size, rows_in, rows_out,
                                   # set_args needs sized scalar types
                                   np.int32(1 if not is_last_layer else 0))
            cl.enqueue_task(self.queue, self.kForward).wait()

            if is_last_layer:
                print("Running layer {} softmax kernel".format(layer))
                self.kForwardSoftMax.set_args(self.act_buffers[layer + 1],
                                              self.minibatch_size, rows_out)
                cl.enqueue_task(self.queue, self.kForwardSoftMax).wait()
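The `benchmark` flag is unused in the body above. A minimal per-layer timing sketch, assuming the queue was created with profiling enabled, could replace the enqueue inside the loop:

# Sketch only: requires a profiling-enabled queue, e.g.
#   queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE)
evt = cl.enqueue_task(self.queue, self.kForward)
evt.wait()
if benchmark:
    # event.profile timestamps are in nanoseconds
    elapsed_ms = (evt.profile.end - evt.profile.start) * 1e-6
    print("layer {} forward kernel: {:.3f} ms".format(layer, elapsed_ms))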
Example #3
    def bw_function(self, benchmark=False):
        for layer in range(self.layers, -1, -1):
            rows_in = self.layer_height[layer]
            is_last_layer = layer == self.layers
            if is_last_layer:
                #print("Running layer {} first_delta kernel".format(layer))
                self.kBackwardFirstDelta.set_args(self.act_buffers[layer],
                                                  self.ground_truth_buffer,
                                                  self.delta_buffers[layer],
                                                  self.minibatch_size,
                                                  self.layer_height[layer])
                cl.enqueue_task(self.queue, self.kBackwardFirstDelta).wait()
                #print("Running layer {} first_delta kernel. COMPLETED".format(layer))
            else:
                rows_out = self.layer_height[layer + 1]

                #print("Running layer {} backwards kernel".format(layer))
                self.kBackward.set_args(
                    self.act_buffers[layer], self.weights_buffers[layer],
                    self.dW_buffers[layer], self.bias_buffers[layer],
                    self.delta_buffers[layer], self.delta_buffers[layer + 1],
                    self.learn_rate, self.minibatch_size, rows_in, rows_out,
                    np.int32(layer))  # set_args needs a sized scalar type
                cl.enqueue_task(self.queue, self.kBackward).wait()
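Both passes block on every kernel. Since PyOpenCL's default command queue is in-order, the kernels already execute back-to-back; a sketch of the same sweep with the per-kernel waits dropped in favor of one host-side sync:

for layer in range(self.layers, -1, -1):
    # ... set_args exactly as above ...
    cl.enqueue_task(self.queue, self.kBackwardFirstDelta
                    if layer == self.layers else self.kBackward)
self.queue.finish()  # one blocking call after the whole sweep is enqueued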
Example #4
File: main.py Project: yuyichao/explore
import numpy as np
import pyopencl as cl


def main():
    ctx = cl.create_some_context()
    queue = cl.CommandQueue(ctx)
    mf = cl.mem_flags
    buf = cl.Buffer(ctx, mf.READ_ONLY, 1000)
    buf2 = cl.Buffer(ctx, mf.READ_WRITE, 8)
    # get_addr writes the device-side address of `in` into `out`.
    prg = cl.Program(ctx, """
    __kernel void
    get_addr(__global const int *in, __global long *out)
    {
        *out = (long)in;
    }
    """).build()

    knl = prg.get_addr
    knl.set_args(buf, buf2)
    cl.enqueue_task(queue, knl)

    b = np.empty([1], dtype=np.int64)
    cl.enqueue_copy(queue, b, buf2).wait()
    print(b[0])

    # Build an identical second program and run it again to see whether the
    # buffer's device-side address stays the same across program objects.
    prg = cl.Program(ctx, """
    __kernel void
    get_addr(__global const int *in, __global long *out)
    {
        *out = (long)in;
    }
    """).build()
    knl = prg.get_addr
    knl.set_args(buf, buf2)
    cl.enqueue_task(queue, knl)

    b = np.empty([1], dtype=np.int64)
    cl.enqueue_copy(queue, b, buf2).wait()
    print(b[0])
Example #5
def f(t, y_in, y_out, wait_for=None):
    knl.set_args(y_in.base_data, y_out.base_data)
    # tolerate the default wait_for=None before appending start_evt
    return cl.enqueue_task(queue, knl,
                           wait_for=(wait_for or []) + [start_evt])
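A usage sketch for this wrapper (hypothetical driver: `y_in`/`y_out` are assumed to be pyopencl.array.Arrays, whose `.base_data` attribute is the underlying buffer, and `start_evt` can be any event, here a marker):

# Gate the first step on a marker, then chain the second step on the first.
start_evt = cl.enqueue_marker(queue)
evt1 = f(0.0, y_in, y_out, wait_for=[])
evt2 = f(0.1, y_out, y_in, wait_for=[evt1])
evt2.wait()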
Example #6
File: opencl.py Project: jabooth/PyOP2
    def _post_kernel_reduction_task(self, nelems, reduction_operator):
        assert reduction_operator in [INC, MIN, MAX]

        def generate_code():
            def headers():
                if self.dtype == np.dtype('float64'):
                    return """
#if defined(cl_khr_fp64)
#if defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#else
#pragma OPENCL EXTENSION cl_khr_fp64 : enable
#endif
#elif defined(cl_amd_fp64)
#pragma OPENCL EXTENSION cl_amd_fp64 : enable
#endif

"""
                else:
                    return ""

            op = {INC: 'INC', MIN: 'min', MAX: 'max'}[reduction_operator]

            return """
%(headers)s
#define INC(a,b) ((a)+(b))
__kernel
void global_%(type)s_%(dim)s_post_reduction (
  __global %(type)s* dat,
  __global %(type)s* tmp,
  __private int count
)
{
  __private %(type)s accumulator[%(dim)d];
  for (int j = 0; j < %(dim)d; ++j)
  {
    accumulator[j] = dat[j];
  }
  for (int i = 0; i < count; ++i)
  {
    for (int j = 0; j < %(dim)d; ++j)
    {
      accumulator[j] = %(op)s(accumulator[j], *(tmp + i * %(dim)d + j));
    }
  }
  for (int j = 0; j < %(dim)d; ++j)
  {
    dat[j] = accumulator[j];
  }
}
""" % {'headers': headers(), 'dim': self.cdim, 'type': self._cl_type, 'op': op}

        src, kernel = _reduction_task_cache.get(
            (self.dtype, self.cdim, reduction_operator), (None, None))
        if src is None:
            src = generate_code()
            prg = cl.Program(_ctx, src).build(options="-Werror")
            name = "global_%s_%s_post_reduction" % (self._cl_type, self.cdim)
            kernel = getattr(prg, name)
            _reduction_task_cache[
                (self.dtype, self.cdim, reduction_operator)] = (src, kernel)

        kernel.set_arg(0, self._array.data)
        kernel.set_arg(1, self._d_reduc_array.data)
        kernel.set_arg(2, np.int32(nelems))
        cl.enqueue_task(_queue, kernel).wait()
        self._array.get(queue=_queue, ary=self._data)
        self.state = DeviceDataMixin.BOTH
        del self._d_reduc_array
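As a cross-check, a NumPy model of what the generated kernel computes for the INC case (a sketch; for MIN/MAX substitute elementwise np.minimum/np.maximum): each of the `dim` components of `dat` is folded with its column across `count` partial-result rows in `tmp`.

import numpy as np

def post_reduce_inc(dat, tmp, count, dim):
    # dat: length-dim vector of current values; tmp: count*dim partial results
    acc = dat[:dim].copy()
    for i in range(count):
        acc = acc + tmp[i * dim:(i + 1) * dim]  # INC(a, b) == (a) + (b)
    dat[:dim] = acc
    return dat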
Example #7
import numpy as np
import pyopencl as cl

# Get device and context, create command queue and program
# (`utility`, `kernel_src`, and NUM_SHORTS are defined elsewhere in this project)
dev = utility.get_default_device()
context = cl.Context(devices=[dev])
queue = cl.CommandQueue(context, dev)

# Build program in the specified context using the kernel source code
prog = cl.Program(context, kernel_src)
try:
    prog.build(options=["-Werror"], devices=[dev])
except cl.Error:
    print("Build log:")
    print(prog.get_build_info(dev, cl.program_build_info.LOG))
    raise

# Data and device buffers
data = np.arange(start=0, stop=NUM_SHORTS, dtype=np.uint16)
np.random.shuffle(data)
print("Input: " + str(data))

mf = cl.mem_flags
data_buffer = cl.Buffer(context, mf.READ_WRITE | mf.COPY_HOST_PTR, hostbuf=data)

# Execute kernel
# radix_sort8(__global ushort8 *global_data)
kernel = prog.radix_sort8
kernel.set_arg(0, data_buffer)
cl.enqueue_task(queue, kernel)
cl.enqueue_copy(queue, dest=data, src=data_buffer, is_blocking=True)

print("Output: " + str(data))