def setup_opencl(data, cube_size):
    import pycl

    blocking = True

    with timeify("Making context, loading kernel"):
        devices = pycl.clGetDeviceIDs()
        ctx = pycl.clCreateContext(devices=devices)
        queue = pycl.clCreateCommandQueue(ctx)

        program = pycl.clCreateProgramWithSource(ctx, SOURCE).build()

        score_matrix = program['score_matrix_to_rms']
        score_matrix.argtypes = (pycl.cl_mem, pycl.cl_mem, pycl.cl_mem,
                                 pycl.cl_mem, pycl.cl_mem, pycl.cl_int,
                                 pycl.cl_int)

    sub_divisions = cube_size**3

    with timeify("Creating buffers"):
        in_r_buf, in_evt1 = pycl.buffer_from_pyarray(queue,
                                                     data['in_r'],
                                                     blocking=blocking)
        in_g_buf, in_evt2 = pycl.buffer_from_pyarray(queue,
                                                     data['in_g'],
                                                     blocking=blocking)
        in_b_buf, in_evt3 = pycl.buffer_from_pyarray(queue,
                                                     data['in_b'],
                                                     blocking=blocking)

        out_r = data['out_r']
        out_r_buf, in_evt4 = pycl.buffer_from_pyarray(queue,
                                                      out_r,
                                                      blocking=blocking)

        score = array.array('f', [0 for x in range(sub_divisions)])
        score_buf, in_evt5 = pycl.buffer_from_pyarray(queue,
                                                      score,
                                                      blocking=blocking)

    with timeify("Run kernel r"):
        run_evt = score_matrix(
            #in_r_buf, in_g_buf, in_b_buf, out_r_buf, score_buf,
            in_r_buf,
            in_g_buf,
            in_b_buf,
            in_r_buf,
            score_buf,
            len(data['in_r']),
            cube_size,
            wait_for=[in_evt1, in_evt2, in_evt3, in_evt4,
                      in_evt5]).on(queue, sub_divisions)

    with timeify("Retrive data"):
        score_from_gpu, evt = pycl.buffer_to_pyarray(queue,
                                                     score_buf,
                                                     wait_for=run_evt,
                                                     like=score)

    return score_from_gpu
Example #2
 def get_queue(cls, device=get_gpu()):
     # Return a cached command queue for this device, creating the context
     # and queue on first use.
     if device.value in cls.queues:
         return cls.queues[device.value]
     ctx = pycl.clCreateContext(devices=[device])
     queue = pycl.clCreateCommandQueue(context=ctx, device=device)
     cls.queues[device.value] = queue
     return queue
Example #3
def get_context_and_queue_from_devices(devices):
    # Cache one (context, queue) pair per unique combination of devices,
    # keyed by vendor id.
    key = tuple(device.vendor_id for device in devices)
    try:
        return devices_context_queue_map[key]
    except KeyError:
        context = pycl.clCreateContext(devices)
        queue = pycl.clCreateCommandQueue(context)
        devices_context_queue_map[key] = (context, queue)
        return devices_context_queue_map[key]
Example #5
 def __init__(self):
     """Create a context and queue that can be reused across calls."""
     devices = cl.clGetDeviceIDs()
     self.device = devices[-1]
     self.context = cl.clCreateContext([self.device])
     self.queue = cl.clCreateCommandQueue(self.context)
Example #7
def ocl_init(ocl_src):
    platforms = cl.clGetPlatformIDs()
    use_devices = None
    for platform in platforms:
        try:
            devices = cl.clGetDeviceIDs(platform, device_type=cl.CL_DEVICE_TYPE_GPU)
            use_devices = devices[0:1]  # arbitrarily choose the first device
        except cl.DeviceNotFoundError:
            pass
        if use_devices is not None:
            break
    if use_devices is None:
        raise ValueError("no GPU OpenCL device found")
    assert use_devices is not None
    print("OpenCL use_devices: " + str(use_devices))

    context = cl.clCreateContext(use_devices)
    queue = cl.clCreateCommandQueue(context)

    prog = cl.clCreateProgramWithSource(context, ocl_src).build()
    print(prog)
    # run_mxplusb(prog, queue)
    run_conv(prog, queue)
Example #9
 def __init__(self, array, output):
     self.device = clGetDeviceIDs()[-1]
     self.context = clCreateContext([self.device])
     self.queue = clCreateCommandQueue(self.context)
     self.array = array
     self.output = output
Example #10
 def __init__(self):
     self.context = cl.clCreateContextFromType()
     self.queue = cl.clCreateCommandQueue(
         context=self.context
     )  #, properties=cl.cl_command_queue_properties.CL_QUEUE_PROFILING_ENABLE)
     self.device = self.queue.device
Example #11
def get_unique_kernel_name():
    global count
    count += 1
    return "fn{}".format(count)


if backend in {"ocl", "opencl", "OCL"}:
    try:
        # platforms = cl.clGetPlatformIDs()
        # devices = cl.clGetDeviceIDs(platforms[1])
        devices = cl.clGetDeviceIDs(device_type=cl.CL_DEVICE_TYPE_GPU)
    except cl.DeviceNotFoundError:
        devices = cl.clGetDeviceIDs()
    context = cl.clCreateContext(devices[-1:])
    if os.environ.get("TRAVIS"):
        queues = [cl.clCreateCommandQueue(context)]
    else:
        queues = [cl.clCreateCommandQueue(context) for _ in range(8)]
        # queues = [
        #     cl.clCreateCommandQueue(
        #         context,
        #         properties=cl.CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE
        #     ) for _ in range(10)
        # ]
    queue = queues[0]

hm_dir = os.path.join(tempfile.gettempdir(), "hindemith")

if not os.path.exists(hm_dir):
    os.mkdir(hm_dir)
unique_file_id = -1
Example #13
 def __init__(self):
     self.device = clGetDeviceIDs()[-1]
     self.context = clCreateContext([self.device])
     self.queue = clCreateCommandQueue(self.context)
Example #14
 def __init__(self):
     self.context = cl.clCreateContextFromType()
     self.queue = cl.clCreateCommandQueue(self.context, device=TARGET_GPU)
Example #15
 def __init__(self):
     self.context = cl.clCreateContextFromType()
     self.queue = cl.clCreateCommandQueue(self.context)
Example #16
    def transform(self, py_ast, program_config):
        """
        Convert the Python AST to a C AST according to the directions
        given in program_config.
        """
        arg_config, tuner_config = program_config
        dot = DotWriter()

        # hack: reset the per-class id counters before each compilation
        ComputedVector._next_id = 0
        CopiedVector._next_id = 0

        # set up OpenCL context and memory spaces
        import pycl
        context = pycl.clCreateContextFromType(pycl.CL_DEVICE_TYPE_ALL)
        queues = [pycl.clCreateCommandQueue(context, dev) for dev in context.devices]
        c_func = ElementwiseFunction(context, queues)

        memories = [MainMemory()] + [OclMemory(q) for q in queues]
        main_memory = memories[0]
        ptrs = arg_config['ptrs']
        dtype, length = ptrs[0]._dtype_, arg_config['len']

        # pull stuff out of autotuner
        distribute_directives = tuner_config['distribute']
        reassoc_directives = tuner_config['reassociate']
        locs = [memories[loc] for loc in tuner_config['locs']]
        fusion_directives = tuner_config['fusion']
        parallelize_directives = tuner_config['parallelize']

        dot.write(py_ast)

        # run basic conversions
        proj = PyBasicConversions().visit(py_ast)
        dot.write(proj)

        # run platform-independent transformations
        proj = ApplyDistributiveProperty(distribute_directives).visit(proj)
        dot.write(proj)

        proj = ApplyAssociativeProperty(reassoc_directives).visit(proj)
        dot.write(proj)

        # set parameter types
        proj = VectorFinder(ptrs, main_memory).visit(proj)
        dot.write(proj)

        proj = LocationTagger(locs).visit(proj)
        dot.write(proj)

        proj = InsertIntermediates(main_memory).visit(proj)
        dot.write(proj)

        proj = CopyInserter(main_memory).visit(proj)
        dot.write(proj)

        proj = DoFusion(fusion_directives).visit(proj)
        dot.write(proj)

        proj = AllocateIntermediates(dtype, length).visit(proj)
        dot.write(proj, "postintermed")

        py_op = proj.find(FunctionDecl, name="py_op")
        schedules = FindParallelism(parallelize_directives).visit(py_op)
        py_op.defn = parallelize_tasks(schedules)
        dot.write(proj, "postparallel")

        proj = KernelOutliner(length).visit(proj)
        dot.write(proj)

        proj = LowerKernelCalls().visit(proj)
        dot.write(proj)

        proj = RefConverter().visit(proj)
        dot.write(proj)

        proj = LowerLoopsAndCopies(length).visit(proj)
        dot.write(proj)

        zipper = ArgZipper()
        proj = zipper.visit(Lifter().visit(proj))
        c_func.extra_args = zipper.extra_args
        c_func.answer = zipper.answer
        dot.write(proj)

        fn = proj.find(FunctionDecl)
        return c_func.finalize("py_op", proj, fn.get_type())
Example #18
prefetch_option = os.getenv("LATTE_PREFETCH_MODE", "ON")

unroll_options = ["ON", "OFF"]

unroll_option = os.getenv("LATTE_UNROLL", "ON")

if parallel_strategy not in parallel_strategies:
    logger.warn("Invalid parallel strategy [%s], defaulting to OPENMP",
                parallel_strategy)
    parallel_strategy = "OPENMP"

nthreads = os.getenv("LATTE_NUM_THREADS", None)
if parallel_strategy == "OPENCL_SIMPLE_LOOP":
    import pycl as cl
    cl_ctx = cl.clCreateContext()
    cl_queue = cl.clCreateCommandQueue(cl_ctx)
elif parallel_strategy in ["SIMPLE_LOOP"
                           ] or parallel_strategy in ["FLOWGRAPH_LOOP"]:
    package_path = os.path.dirname(os.path.abspath(__file__))
    _file = FileTemplate(
        os.path.dirname(os.path.abspath(__file__)) + "/runtime/runtime.cpp",
        {"LATTE_PACKAGE_PATH": StringTemplate(package_path)})

    c_file = C.CFile("runtime", [_file])
    module = util.mpi_compile(ctree.nodes.Project([c_file]))
    init_nthreads = module.get_callable("init_nthreads",
                                        ctypes.CFUNCTYPE(None, ctypes.c_int))
    init_default = module.get_callable("init_default", ctypes.CFUNCTYPE(None))
    if nthreads is not None:
        init_nthreads(int(nthreads))
    else: