예제 #1
0
    def shadow_kernel(self, *args):
        """
        Stand-in for the kernel method defined in a StencilKernel sub-class.

        A zero-filled array shaped like the first argument is allocated to
        receive the result.  In pure python mode the python kernel is run
        directly on the arguments plus that array.  Otherwise the cached
        specialized function is reused as long as the argument shapes match
        the shapes it was built for; on a mismatch (or when nothing has been
        cached yet) a new SpecializedStencil is created first.  The
        specialized function is then called with the arguments, the output
        array and a reference to a C float, and that C float is passed to the
        specialized function's ``report`` method as ``time``.

        :param args: The arguments to our original kernel method.
        :return: The newly allocated output array.
        """
        result = np.zeros_like(args[0])

        if self.pure_python:
            python_args = args + (result, )
            self.pure_python_kernel(*python_args)
            return result

        known_shapes = self.specialized_sizes
        if not known_shapes or known_shapes != [a.shape for a in args]:
            # Cache miss: specialize for the shapes of this call.
            self.specialized = SpecializedStencil(
                self.model, args, result, self, self.testing
            )
            self.specialized_sizes = [a.shape for a in args]

        elapsed = c_float()
        call_args = args + (result, byref(elapsed))
        self.specialized(*call_args)
        self.specialized.report(time=elapsed)
        return result
예제 #2
0
    def get_ir_nodes(self, args):
        """
        Build the Loop node for this stencil applied to ``args``.

        A deep copy of the original tree is run through PythonToStencilModel
        and then through the configured backend.  The first entry in the body
        of the OclFile found in the transformed tree supplies the loop body
        and the parameters; each parameter's ``type`` is set to an instance
        of the numpy ndpointer type built from the matching array (the
        sub-config arrays followed by the output array).

        :param args: Arguments the stencil is applied to; the output array
                     is allocated with ``np.zeros_like(args[0])``.
        :return: A single-element list holding the Loop node.
        """
        tree = copy.deepcopy(self.original_tree)
        arg_cfg = self.args_to_subconfig(args)

        result = np.zeros_like(args[0])
        shape = result.shape

        pointer_types = []
        for array in arg_cfg + (result, ):
            pointer_types.append(
                np.ctypeslib.ndpointer(array.dtype, array.ndim, array.shape)
            )

        # Both transformers are constructed before either one visits the tree.
        passes = [
            PythonToStencilModel(),
            self.backend(self.args, result, self.kernel, arg_cfg=arg_cfg,
                         fusable_nodes=None),
        ]
        for stage in passes:
            tree = stage.visit(tree)

        entry = tree.find(OclFile).body[0]
        loop_body = entry.defn
        params = entry.params
        print(tree.files[0])
        for position, pointer_type in enumerate(pointer_types):
            params[position].type = pointer_type()

        leading = params[:-2]
        second_to_last = [params[-2]]
        last = [params[-1]]
        return [Loop(shape, leading, second_to_last, pointer_types, loop_body,
                     last)]
예제 #3
0
    def __call__(self, *args):
        """
        Run the stencil kernel(s) on ``args`` and return the result.

        The output array is modelled on the first argument: ``empty_like``
        when it is an hmarray, ``np.zeros_like`` otherwise.  hmarray
        arguments contribute their existing ``ocl_buf``; every other array
        is copied into a new buffer with ``buffer_from_ndarray``.

        ``self.kernel`` may be a single kernel or a list of kernels.  A list
        of any length is expanded, in order, into the call to
        ``self._c_function`` ahead of the buffers.

        :param *args: Input arrays (hmarray, or anything accepted by
                      ``buffer_from_ndarray``).
        :return: The output hmarray, or the array produced by
                 ``buffer_to_ndarray`` from the output buffer.
        :raises StencilException: If the C function returns a code other
                                  than ``CL_SUCCESS``.
        """
        if isinstance(args[0], hmarray):
            output = empty_like(args[0])
        else:
            output = np.zeros_like(args[0])

        buffers = []
        events = []
        for arg in args + (output, ):
            if isinstance(arg, hmarray):
                buffers.append(arg.ocl_buf)
            else:
                buf, evt = buffer_from_ndarray(self.queue, arg, blocking=True)
                events.append(evt)
                buffers.append(buf)
        if events:
            # OpenCL specifies that clWaitForEvents fails for an empty event
            # list, which is what we have when every array is an hmarray.
            cl.clWaitForEvents(*events)

        # Pass however many kernels there are; a list whose length was not
        # 2, 3 or 4 used to skip execution and still report success.
        if isinstance(self.kernel, list):
            kernels = list(self.kernel)
        else:
            kernels = [self.kernel]
        cl_error = self._c_function(self.queue, *(kernels + buffers))

        if cl.cl_errnum(cl_error) != cl.cl_errnum.CL_SUCCESS:
            raise StencilException(
                "Error executing stencil kernel: opencl {} {}".format(
                    cl_error,
                    cl.cl_errnum(cl_error)
                )
            )
        if isinstance(output, hmarray):
            return output
        buf, evt = buffer_to_ndarray(
            self.queue, buffers[-1], output
        )
        evt.wait()

        return buf
예제 #4
0
    def python_kernel_wrapper(self, *args):
        """
        Run the python kernel into a freshly allocated output array.

        The output is a zero-filled array shaped like the first argument and
        is appended to ``args`` for the kernel call.  When ``self.is_copied``
        is set, every point yielded by ``self.halo_points`` is overwritten in
        the output with the first argument's value at that point.

        :param args: Kernel inputs; ``args[0]`` is the input grid.
        :return: The output array.
        """
        source = args[0]
        result = np.zeros_like(source)
        kernel_args = args + (result,)
        self.kernel(*kernel_args)

        if not self.is_copied:
            return result

        for location in self.halo_points(source):
            result[location] = source[location]
        return result
예제 #5
0
    def __call__(self, *args):
        """Invoke the compiled C function and return its output array.

        The output array is ``self.output`` when one has been set (the
        attribute is reset to ``None`` once taken); otherwise a zero-filled
        array shaped like ``args[0]`` is allocated.  The C function receives
        ``args`` followed by the output array and a reference to a C float.

        :param *args: Arguments to be passed to our C function, the types
                      should match the types specified by the `entry_type`
                      that was passed to :attr: `finalize`.
        :return: The output array handed to the C function.

        """
        # TODO: provide stronger type checking to give users better error
        # messages.
        result = self.output
        if result is None:  # pragma no cover
            result = np.zeros_like(args[0])
        else:
            self.output = None
        elapsed = c_float()
        c_args = args + (result, byref(elapsed))
        self._c_function(*c_args)
        return result
예제 #6
0
    def __call__(self, *args):
        """Call the underlying C function with ``args`` plus an output array.

        If ``self.output`` holds an array it is used as the output and the
        attribute is cleared; if not, the output is a zero-filled array
        shaped like the first argument.  A reference to a C float is passed
        to the C function as the final argument.

        :param *args: Arguments to be passed to our C function, the types should
                      match the types specified by the `entry_type` that was
                      passed to :attr: `finalize`.
        :return: The output array that was passed to the C function.

        """
        # TODO: provide stronger type checking to give users better error
        # messages.
        elapsed = c_float()
        if self.output is None:
            result = np.zeros_like(args[0])
        else:
            result = self.output
            self.output = None
        self._c_function(*(args + (result, byref(elapsed))))
        return result
예제 #7
0
    def __call__(self, *args):
        """
        Upload the arguments, run the OpenCL kernel and read back the result.

        The output array is ``self.output`` when one has been set (the
        attribute is then cleared), otherwise a zero-filled array shaped like
        ``args[0]``.  Each input and the output is copied into a buffer with
        ``buffer_from_ndarray`` and bound to the kernel argument at the same
        position.  One further kernel argument, a ``localmem`` block, is
        bound after them.

        The local work size is 8 per dimension on GPU devices and 1
        otherwise.  The local memory size is ``sizeof(c_float)`` multiplied
        by ``local + 2 * self.ghost_depth`` once per dimension of
        ``args[0]``.

        :param *args: Input arrays; ``args[0]`` determines the output shape
                      and the number of dimensions.
        :return: The array produced by ``buffer_to_ndarray`` from the output
                 buffer.
        """
        if self.output is not None:
            output = self.output
            self.output = None
        else:
            output = np.zeros_like(args[0])

        arrays = args + (output, )
        self.kernel.argtypes = tuple(cl_mem for _ in arrays) + (localmem, )
        bufs = []
        events = []
        for index, arg in enumerate(arrays):
            buf, evt = buffer_from_ndarray(self.queue, arg, blocking=False)
            events.append(evt)
            bufs.append(buf)
            self.kernel.setarg(index, buf, sizeof(cl_mem))
        # The uploads were non-blocking; wait for all of them before launch.
        cl.clWaitForEvents(*events)

        if self.device.type == cl.cl_device_type.CL_DEVICE_TYPE_GPU:
            local = 8
        else:
            local = 1
        ndim = args[0].ndim
        localmem_size = reduce(operator.mul,
                               (local + (self.ghost_depth * 2)
                                for _ in range(ndim)),
                               sizeof(c_float))
        self.kernel.setarg(
            len(args) + 1, localmem(localmem_size), localmem_size)
        evt = clEnqueueNDRangeKernel(self.queue, self.kernel, self.global_size,
                                     tuple(local for _ in range(ndim)))
        evt.wait()
        result, evt = buffer_to_ndarray(self.queue, bufs[-1], output)
        evt.wait()

        # The former ``for mem in bufs: del mem`` loop only unbound its own
        # loop variable and released nothing, so it has been removed.  The
        # local references in ``bufs`` go away when this method returns.
        return result
예제 #8
0
 def pure_python(self, *args):
     """
     Run ``self.kernel`` in python on ``args`` plus a new output array.

     :param args: Kernel inputs; the output is a zero-filled array shaped
                  like ``args[0]`` and is passed as the last argument.
     :return: The output array.
     """
     result = np.zeros_like(args[0])
     kernel_args = args + (result, )
     self.kernel(*kernel_args)
     return result
예제 #9
0
 def generate_output(self, args):
     """
     Return the output array, allocating and storing it on first use.

     :param args: Sequence whose first element is the template for a
                  zero-filled array when ``self.output`` is ``None``.
     :return: ``self.output``.
     """
     if self.output is None:
         self.output = np.zeros_like(args[0])
     return self.output