Exemplo n.º 1
0
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void add(global int* a, global int* b, global int* c) {         \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + b[index];                                    \
        }                                                                      \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + d*b[index];                                  \
        }"

        q = dpctl.SyclQueue("level_zero:gpu")
        dpctl_prog.create_program_from_source(q, oclSrc)
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = d*a[index] + b[index];                                  \
        }"

        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            axpyKernel = prog.get_sycl_kernel("axpy")

            abuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            bbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            cbuf = dpctl_mem.MemoryUSMShared(1024 * np.dtype("i").itemsize)
            a = np.ndarray((1024), buffer=abuf, dtype="i")
            b = np.ndarray((1024), buffer=bbuf, dtype="i")
            c = np.ndarray((1024), buffer=cbuf, dtype="i")
            a[:] = np.arange(1024)
            b[:] = np.arange(1024, 0, -1)
            c[:] = 0
            d = 2
            args = []

            args.append(a.base)
            args.append(b.base)
            args.append(c.base)
            args.append(ctypes.c_int(d))

            r = [1024]

            q.submit(axpyKernel, args, r)
            self.assertTrue(np.allclose(c, a * d + b))
Exemplo n.º 3
0
    def test_create_program_from_source(self):
        oclSrc = "                                                             \
        kernel void add(global int* a, global int* b, global int* c) {         \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + b[index];                                    \
        }                                                                      \
        kernel void axpy(global int* a, global int* b, global int* c, int d) { \
            size_t index = get_global_id(0);                                   \
            c[index] = a[index] + d*b[index];                                  \
        }"
        with dpctl.device_context("opencl:gpu:0"):
            q = dpctl.get_current_queue()
            prog = dpctl_prog.create_program_from_source(q, oclSrc)
            self.assertIsNotNone(prog)

            self.assertTrue(prog.has_sycl_kernel("add"))
            self.assertTrue(prog.has_sycl_kernel("axpy"))

            addKernel = prog.get_sycl_kernel("add")
            axpyKernel = prog.get_sycl_kernel("axpy")

            self.assertEqual(addKernel.get_function_name(), "add")
            self.assertEqual(axpyKernel.get_function_name(), "axpy")
            self.assertEqual(addKernel.get_num_args(), 3)
            self.assertEqual(axpyKernel.get_num_args(), 4)
Exemplo n.º 4
0
def produce_event(profiling=False):
    oclSrc = "                                                                 \
            kernel void add(global int* a) {                                   \
                size_t index = get_global_id(0);                               \
                a[index] = a[index] + 1;                                       \
            }"

    if profiling:
        q = dpctl.SyclQueue("opencl:cpu", property="enable_profiling")
    else:
        q = dpctl.SyclQueue("opencl:cpu")
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    addKernel = prog.get_sycl_kernel("add")

    bufBytes = 1024 * np.dtype("i").itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((1024), buffer=abuf, dtype="i")
    a[:] = np.arange(1024)
    args = []

    args.append(a.base)
    r = [1024]
    ev = q.submit(addKernel, args, r)

    return ev
Exemplo n.º 5
0
 def test_create_program_from_source(self):
     oclSrc = "                                                             \
     kernel void add(global int* a, global int* b, global int* c) {         \
         size_t index = get_global_id(0);                                   \
         c[index] = a[index] + b[index];                                    \
     }                                                                      \
     kernel void axpy(global int* a, global int* b, global int* c, int d) { \
         size_t index = get_global_id(0);                                   \
         c[index] = a[index] + d*b[index];                                  \
     }"
     with dpctl.device_context("level0:gpu:0"):
         q = dpctl.get_current_queue()
         prog = dpctl_prog.create_program_from_source(q, oclSrc)
Exemplo n.º 6
0
def test_get_wait_list():
    if has_cpu():
        oclSrc = "                                                             \
            kernel void add_k(global float* a) {                               \
                size_t index = get_global_id(0);                               \
                a[index] = a[index] + 1;                                       \
            }                                                                  \
            kernel void sqrt_k(global float* a) {                              \
                size_t index = get_global_id(0);                               \
                a[index] = sqrt(a[index]);                                     \
            }                                                                  \
            kernel void sin_k(global float* a) {                               \
                size_t index = get_global_id(0);                               \
                a[index] = sin(a[index]);                                      \
            }"

        q = dpctl.SyclQueue("opencl:cpu")
        prog = dpctl_prog.create_program_from_source(q, oclSrc)
        addKernel = prog.get_sycl_kernel("add_k")
        sqrtKernel = prog.get_sycl_kernel("sqrt_k")
        sinKernel = prog.get_sycl_kernel("sin_k")

        bufBytes = 1024 * np.dtype("f").itemsize
        abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
        a = np.ndarray((1024), buffer=abuf, dtype="f")
        a[:] = np.arange(1024)
        args = []

        args.append(a.base)
        r = [1024]
        ev_1 = q.submit(addKernel, args, r)
        ev_2 = q.submit(sqrtKernel, args, r, dEvents=[ev_1])
        ev_3 = q.submit(sinKernel, args, r, dEvents=[ev_2])

        try:
            wait_list = ev_3.get_wait_list()
        except ValueError:
            pytest.fail(
                "Failed to get a list of waiting events from SyclEvent")
        assert len(wait_list)
Exemplo n.º 7
0
def test_create_program_from_source(ctype_str, dtype, ctypes_ctor):
    try:
        q = dpctl.SyclQueue("opencl", property="enable_profiling")
    except dpctl.SyclQueueCreationError:
        pytest.skip("OpenCL queue could not be created")
    # OpenCL conventions for indexing global_id is opposite to
    # that of SYCL (and DPCTL)
    oclSrc = ("kernel void axpy("
              "   global " + ctype_str + " *a, global " + ctype_str + " *b,"
              "   global " + ctype_str + " *c, " + ctype_str + " d) {"
              "   size_t index = get_global_id(0);"
              "   c[index] = d * a[index] + b[index];"
              "}")
    prog = dpctl_prog.create_program_from_source(q, oclSrc)
    axpyKernel = prog.get_sycl_kernel("axpy")

    n_elems = 1024 * 512
    lws = 128
    bufBytes = n_elems * dtype.itemsize
    abuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    bbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    cbuf = dpctl_mem.MemoryUSMShared(bufBytes, queue=q)
    a = np.ndarray((n_elems, ), buffer=abuf, dtype=dtype)
    b = np.ndarray((n_elems, ), buffer=bbuf, dtype=dtype)
    c = np.ndarray((n_elems, ), buffer=cbuf, dtype=dtype)
    a[:] = np.arange(n_elems)
    b[:] = np.arange(n_elems, 0, -1)
    c[:] = 0
    d = 2
    args = [a.base, b.base, c.base, ctypes_ctor(d)]

    assert n_elems % lws == 0

    for r in (
        [
            n_elems,
        ],
        [2, n_elems],
        [2, 2, n_elems],
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, r).wait()
            ref_c = a * np.array(d, dtype=dtype) + b
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Failed for {}".format(r)

    for gr, lr in (
        (
            [
                n_elems,
            ],
            [lws],
        ),
        ([2, n_elems], [2, lws // 2]),
        ([2, 2, n_elems], [2, 2, lws // 4]),
    ):
        c[:] = 0
        timer = dpctl.SyclTimer()
        with timer(q):
            q.submit(axpyKernel, args, gr, lr, [dpctl.SyclEvent()]).wait()
            ref_c = a * np.array(d, dtype=dtype) + b
        host_dt, device_dt = timer.dt
        assert type(host_dt) is float and type(device_dt) is float
        assert np.allclose(c, ref_c), "Faled for {}, {}".formatg(r, lr)