def setup_opencl(data, cube_size):
    """Run the ``score_matrix_to_rms`` OpenCL kernel and return its scores.

    Creates a context over every device pycl reports, builds the
    module-level ``SOURCE`` program, uploads the ``in_r``, ``in_g``,
    ``in_b`` and ``out_r`` arrays from ``data`` plus a zero-filled float
    score array, enqueues the kernel with a size of ``cube_size**3`` and
    reads the score buffer back.

    Args:
        data: Mapping that provides the arrays under the keys ``'in_r'``,
            ``'in_g'``, ``'in_b'`` and ``'out_r'``.
        cube_size: Passed to the kernel as its last argument; the score
            array holds ``cube_size**3`` entries.

    Returns:
        The score array read back from the device buffer.
    """
    import pycl

    with timeify("Making context, loading kernel"):
        all_devices = pycl.clGetDeviceIDs()
        context = pycl.clCreateContext(devices=all_devices)
        queue = pycl.clCreateCommandQueue(context)

        program = pycl.clCreateProgramWithSource(context, SOURCE).build()
        kernel = program['score_matrix_to_rms']
        # Five device buffers followed by two integer scalars.
        kernel.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    work_items = cube_size ** 3

    with timeify("Creating buffers"):
        # Every upload is requested as a blocking transfer.
        in_r_buf, r_ready = pycl.buffer_from_pyarray(
            queue, data['in_r'], blocking=True)
        in_g_buf, g_ready = pycl.buffer_from_pyarray(
            queue, data['in_g'], blocking=True)
        in_b_buf, b_ready = pycl.buffer_from_pyarray(
            queue, data['in_b'], blocking=True)
        out_r_buf, out_ready = pycl.buffer_from_pyarray(
            queue, data['out_r'], blocking=True)

        # One float slot per work item, initialised to zero.
        score = array.array('f', [0] * work_items)
        score_buf, score_ready = pycl.buffer_from_pyarray(
            queue, score, blocking=True)

    with timeify("Run kernel r"):
        # NOTE(review): the fourth buffer argument is in_r_buf, not
        # out_r_buf (which is uploaded above but never handed to the
        # kernel) -- confirm this is intentional.
        uploads = [r_ready, g_ready, b_ready, out_ready, score_ready]
        launch = score_kernel_call = kernel(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=uploads)
        run_evt = launch.on(queue, work_items)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)

    return score_from_gpu
Пример #2
0
 def get_queue(cls, device=get_gpu()):
     """Return the command queue cached for ``device``, creating it if needed.

     Queues live in ``cls.queues`` keyed by ``device.value``. On a miss, a
     context holding only ``device`` is created, a queue is created on it
     for that device, and the queue is cached before being returned.

     NOTE(review): the default ``device`` is computed once, when this
     method is defined, not on each call.
     """
     handle = device.value
     if handle not in cls.queues:
         ctx = pycl.clCreateContext(devices=[device])
         cls.queues[handle] = pycl.clCreateCommandQueue(context=ctx,
                                                        device=device)
     return cls.queues[handle]
Пример #3
0
def get_context_and_queue_from_devices(devices):
    """Return the cached ``(context, queue)`` pair for ``devices``.

    ``devices_context_queue_map`` is keyed by the tuple of each device's
    ``vendor_id``. On a miss, a context is created for ``devices``, a
    command queue is created on that context, and the pair is stored.

    NOTE(review): because the key is built from ``vendor_id`` only, two
    different device lists with the same vendor ids share one entry --
    confirm that is intended.
    """
    cache_key = tuple(dev.vendor_id for dev in devices)
    if cache_key not in devices_context_queue_map:
        ctx = pycl.clCreateContext(devices)
        cmd_queue = pycl.clCreateCommandQueue(ctx)
        devices_context_queue_map[cache_key] = (ctx, cmd_queue)
    return devices_context_queue_map[cache_key]
Пример #4
0
def get_context_and_queue_from_devices(devices):
    """Look up, or lazily create, the ``(context, queue)`` pair for ``devices``.

    The lookup key is the tuple of ``vendor_id`` values of the given
    devices. When ``devices_context_queue_map`` has no entry for it, a new
    context over ``devices`` and a command queue on that context are
    created and remembered.

    NOTE(review): keys are derived from ``vendor_id`` alone, so distinct
    devices from the same vendor map to the same entry -- verify.
    """
    vendor_key = tuple(d.vendor_id for d in devices)
    if vendor_key in devices_context_queue_map:
        return devices_context_queue_map[vendor_key]

    new_context = pycl.clCreateContext(devices)
    new_queue = pycl.clCreateCommandQueue(new_context)
    devices_context_queue_map[vendor_key] = (new_context, new_queue)
    return devices_context_queue_map[vendor_key]
Пример #5
0
 def __init__(self):
     """Set up a device, context and command queue for later reuse.

     The last device returned by ``cl.clGetDeviceIDs()`` is selected. The
     device, a context containing only that device, and a queue on that
     context are stored on the instance.
     """
     chosen = cl.clGetDeviceIDs()[-1]
     self.device = chosen
     ctx = cl.clCreateContext([chosen])
     self.context = ctx
     self.queue = cl.clCreateCommandQueue(ctx)
Пример #6
0
def setup_opencl(data, cube_size):
    """Score the input arrays on an OpenCL device and return the scores.

    Steps, each timed with ``timeify``: create a context and queue over
    all devices pycl reports and build ``SOURCE``; upload the ``in_r``,
    ``in_g``, ``in_b`` and ``out_r`` arrays from ``data`` along with a
    zeroed float score array of ``cube_size**3`` entries; enqueue the
    ``score_matrix_to_rms`` kernel; read the score buffer back.

    Args:
        data: Mapping with the keys ``'in_r'``, ``'in_g'``, ``'in_b'`` and
            ``'out_r'``.
        cube_size: Forwarded to the kernel as its final argument;
            ``cube_size**3`` is used as the enqueue size.

    Returns:
        The score array copied back from the device.
    """
    import pycl

    def upload(values):
        # Blocking host-to-device copy; returns (buffer, event).
        return pycl.buffer_from_pyarray(queue, values, blocking=True)

    with timeify("Making context, loading kernel"):
        found = pycl.clGetDeviceIDs()
        context = pycl.clCreateContext(devices=found)
        queue = pycl.clCreateCommandQueue(context)

        program = pycl.clCreateProgramWithSource(context, SOURCE).build()
        kernel = program['score_matrix_to_rms']
        # Five memory objects, then two ints.
        kernel.argtypes = (pycl.cl_mem,) * 5 + (pycl.cl_int,) * 2

    n_scores = cube_size ** 3

    with timeify("Creating buffers"):
        in_r_buf, evt_r = upload(data['in_r'])
        in_g_buf, evt_g = upload(data['in_g'])
        in_b_buf, evt_b = upload(data['in_b'])
        out_r_buf, evt_out = upload(data['out_r'])

        score = array.array('f', [0] * n_scores)
        score_buf, evt_score = upload(score)

    with timeify("Run kernel r"):
        # NOTE(review): in_r_buf is passed twice (positions one and four);
        # out_r_buf is uploaded but not given to the kernel -- confirm
        # this is intentional.
        configured = kernel(
            in_r_buf, in_g_buf, in_b_buf, in_r_buf, score_buf,
            len(data['in_r']), cube_size,
            wait_for=[evt_r, evt_g, evt_b, evt_out, evt_score])
        run_evt = configured.on(queue, n_scores)

    with timeify("Retrive data"):
        score_from_gpu, _ = pycl.buffer_to_pyarray(
            queue, score_buf, wait_for=run_evt, like=score)

    return score_from_gpu
Пример #7
0
def ocl_init(ocl_src):
    """Pick a GPU, build ``ocl_src`` for it and run the convolution.

    Platforms are tried in the order ``cl.clGetPlatformIDs()`` returns
    them; the first GPU device of the first platform that has one is used.
    A context and command queue are created for it, the program is built
    and printed, and ``run_conv`` is called with the program and queue.

    Args:
        ocl_src: Program source handed to ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: If no platform yields a GPU device.
    """
    use_devices = None
    for platform in cl.clGetPlatformIDs():
        try:
            devices = cl.clGetDeviceIDs(
                platform, device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # This platform has no GPU; try the next one.
            continue
        use_devices = devices[0:1]  # arbitrarily choose the first device
        break
    if use_devices is None:
        raise ValueError("no GPU openCL device found")
    print("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)

    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    # Parenthesised single-argument form prints the same on Python 2 and is
    # valid syntax on Python 3 (the bare ``print prog`` statement is not).
    print(prog)
    run_conv(prog, queue)
Пример #8
0
def ocl_init(ocl_src):
    """Select a GPU device, compile ``ocl_src`` and launch ``run_conv``.

    Each OpenCL platform is queried for GPU devices in turn. The search
    stops at the first platform that has one, and only that platform's
    first device is used. The built program is printed before ``run_conv``
    is invoked with the program and a command queue on the new context.

    Args:
        ocl_src: Program source passed to ``cl.clCreateProgramWithSource``.

    Raises:
        ValueError: If none of the platforms reports a GPU device.
    """
    use_devices = None
    for platform in cl.clGetPlatformIDs():
        try:
            gpus = cl.clGetDeviceIDs(platform,
                                     device_type=cl.CL_DEVICE_TYPE_GPU)
        except cl.DeviceNotFoundError:
            # No GPU on this platform; keep looking.
            continue
        use_devices = gpus[0:1]  # arbitrarily choose the first device
        break
    if use_devices is None:
        raise ValueError("no GPU openCL device found")
    print("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)

    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    # ``print(prog)`` behaves like the old ``print prog`` statement on
    # Python 2 and, unlike it, also parses on Python 3.
    print(prog)
    run_conv(prog, queue)
Пример #9
0
 def __init__(self, array, output):
     """Create an OpenCL device/context/queue and remember the two arguments.

     The last device returned by ``clGetDeviceIDs()`` is used for a
     single-device context and a command queue on it. ``array`` and
     ``output`` are stored on the instance unchanged.
     """
     chosen = clGetDeviceIDs()[-1]
     self.device = chosen
     ctx = clCreateContext([chosen])
     self.context = ctx
     self.queue = clCreateCommandQueue(ctx)
     self.array, self.output = array, output
Пример #10
0

def get_unique_kernel_name():
    """Return a new kernel name of the form ``fn<N>``.

    Each call increments the module-level ``count`` and embeds the new
    value in the returned name.
    """
    global count
    count = count + 1
    return "fn" + str(count)


if backend in {"ocl", "opencl", "OCL"}:
    # Ask for GPU devices first; if none is found, take whatever devices
    # the default query returns.
    try:
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()

    # A single context over the last device in the list.
    context = cl.clCreateContext(devices[-1:])

    # Eight queues on that context, or just one when the TRAVIS environment
    # variable is set.
    if not os.environ.get("TRAVIS"):
        queues = [cl.clCreateCommandQueue(context) for _ in range(8)]
    else:
        queues = [cl.clCreateCommandQueue(context)]

    # The first queue is exposed as the module's ``queue``.
    queue = queues[0]

hm_dir = os.path.join(tempfile.gettempdir(), "hindemith")

if not os.path.exists(hm_dir):
Пример #11
0
# Running counter behind get_unique_kernel_name().
count = 0


def get_unique_kernel_name():
    """Return a fresh kernel name of the form ``fn<N>``.

    Every call bumps the module-level ``count`` by one and embeds the new
    value, so the first call after import returns ``"fn1"``.
    """
    global count
    count = count + 1
    return "fn" + str(count)


if backend in {"ocl", "opencl", "OCL"}:
    # GPU devices are preferred; the unrestricted query is the fallback
    # when no GPU is found.
    try:
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()

    # The context covers only the last device of the list.
    context = cl.clCreateContext(devices[-1:])

    # One queue when the TRAVIS environment variable is set, eight
    # otherwise; all share the same context.
    if not os.environ.get("TRAVIS"):
        queues = [cl.clCreateCommandQueue(context) for _ in range(8)]
    else:
        queues = [cl.clCreateCommandQueue(context)]

    # Default queue: the first one created.
    queue = queues[0]
Пример #12
0
 def __init__(self):
     """Create an OpenCL device, context and command queue for this instance.

     Uses the last device returned by ``clGetDeviceIDs()``, a context that
     contains only that device, and a command queue on that context.
     """
     chosen = clGetDeviceIDs()[-1]
     self.device = chosen
     ctx = clCreateContext([chosen])
     self.context = ctx
     self.queue = clCreateCommandQueue(ctx)
Пример #13
0

#
### Specialist-Written Code ###
#
# The code below is written by an industry SPECIALIST. This code is meant
# to be more complicated and requires specialized knowledge to write.
#


#
# Global Constants
#

# NOTE(review): presumably the OpenCL work-group size used by the kernels
# launched from this file -- confirm against the launch code.
WORK_GROUP_SIZE = 1024
# Devices of a context made by clCreateContextFromType() with its default
# arguments, followed by the devices of a default clCreateContext().
devices = cl.clCreateContextFromType().devices + cl.clCreateContext().devices
# Second entry of the combined list; the index assumes the list holds at
# least two devices.
TARGET_GPU = devices[1]
# NOTE(review): how this value is used is not visible in this section.
ITERATIONS = 0


class ConcreteReduction(ConcreteSpecializedFunction):
    def __init__(self):
        """Create an OpenCL context and a command queue on ``TARGET_GPU``.

        The context comes from ``cl.clCreateContextFromType()`` with its
        default arguments; the queue is created on that context for the
        module-level ``TARGET_GPU`` device.
        """
        ctx = cl.clCreateContextFromType()
        self.context = ctx
        self.queue = cl.clCreateCommandQueue(ctx, device=TARGET_GPU)

    def finalize(self, kernel, tree, entry_name, entry_type):
        """Store ``kernel``, compile the entry point and return ``self``.

        Args:
            kernel: Saved as ``self.kernel``.
            tree: Passed to ``self._compile`` as its second argument.
            entry_name: Passed to ``self._compile`` as its first argument.
            entry_type: Passed to ``self._compile`` as its third argument.

        Returns:
            This instance, with ``_c_function`` set to the result of
            ``self._compile``.
        """
        self.kernel = kernel
        compiled = self._compile(entry_name, tree, entry_type)
        self._c_function = compiled
        return self

    def __call__(self, A):
Пример #14
0
# Prefetch mode, read from LATTE_PREFETCH_MODE ("ON" when unset).
# NOTE(review): prefetch_options presumably lists the accepted values; no
# validation against it is visible in this section.
prefetch_options = ["ON", "OFF"]
prefetch_option = os.getenv("LATTE_PREFETCH_MODE", "ON")

# Unroll mode, read from LATTE_UNROLL ("ON" when unset).
unroll_options = ["ON", "OFF"]

unroll_option = os.getenv("LATTE_UNROLL", "ON")

# Fall back to OPENMP when the configured strategy is not a known one.
if parallel_strategy not in parallel_strategies:
    # Logger.warning is the supported spelling; the ``warn`` alias is
    # deprecated and no longer exists on Python 3.13+.
    logger.warning("Invalid parallel strategy [%s], defaulting to OPENMP",
                   parallel_strategy)
    parallel_strategy = "OPENMP"

# Thread count as a string from LATTE_NUM_THREADS, or None when unset.
nthreads = os.getenv("LATTE_NUM_THREADS", None)
if parallel_strategy == "OPENCL_SIMPLE_LOOP":
    import pycl as cl
    cl_ctx = cl.clCreateContext()
    cl_queue = cl.clCreateCommandQueue(cl_ctx)
elif parallel_strategy in ["SIMPLE_LOOP"
                           ] or parallel_strategy in ["FLOWGRAPH_LOOP"]:
    package_path = os.path.dirname(os.path.abspath(__file__))
    _file = FileTemplate(
        os.path.dirname(os.path.abspath(__file__)) + "/runtime/runtime.cpp",
        {"LATTE_PACKAGE_PATH": StringTemplate(package_path)})

    c_file = C.CFile("runtime", [_file])
    module = util.mpi_compile(ctree.nodes.Project([c_file]))
    init_nthreads = module.get_callable("init_nthreads",
                                        ctypes.CFUNCTYPE(None, ctypes.c_int))
    init_default = module.get_callable("init_default", ctypes.CFUNCTYPE(None))
    if nthreads is not None:
        init_nthreads(int(nthreads))