示例#1
0
    def prepare(self, global_size, local_size=None, local_mem=0):
        """Validate the work sizes and store the launch configuration.

        If ``local_size`` is omitted, a suitable one is derived from the
        global size and the device limits.  Raises ``ValueError`` on
        dimension mismatches or non-divisible sizes.
        """
        global_size = wrap_in_tuple(global_size)
        self._local_mem = local_mem

        max_dims = self._thr.device_params.max_work_item_sizes
        if len(global_size) > len(max_dims):
            raise ValueError("Global size has too many dimensions")

        if local_size is None:
            local_size = find_local_size(
                global_size, max_dims, self.max_work_group_size)
        else:
            local_size = wrap_in_tuple(local_size)
            if len(local_size) != len(global_size):
                raise ValueError(
                    "Global/local work sizes have differing dimensions")

        grid = []
        for gsize, lsize in zip(global_size, local_size):
            if gsize % lsize:
                raise ValueError(
                    "Global sizes must be multiples of corresponding local sizes"
                )
            grid.append(gsize // lsize)

        # Pad with 1s up to three dimensions, otherwise PyCUDA will complain.
        padding = (1,) * (3 - len(grid))
        self._local_size = local_size + padding
        self._grid = tuple(grid) + padding
示例#2
0
文件: ocl.py 项目: mgolub2/reikna
 def prepare(self, global_size, local_size=None, local_mem=0):
     """Store the global/local work sizes for the next launch.

     ``local_mem`` is accepted for interface compatibility but ignored,
     since it cannot be easily passed to the kernel
     (a special kernel argument is required).
     """
     self._global_size = wrap_in_tuple(global_size)
     self._local_size = None if local_size is None else wrap_in_tuple(local_size)
示例#3
0
 def prepare(self, global_size, local_size=None, local_mem=0):
     # ``local_mem`` is ignored: passing it through would require a special
     # kernel argument, so it cannot be easily forwarded to the kernel.
     if local_size is not None:
         local_size = wrap_in_tuple(local_size)
     self._local_size = local_size
     self._global_size = wrap_in_tuple(global_size)
示例#4
0
    def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
        """Record array metadata: shape, dtype, strides, offset and buffer size.

        When ``strides`` or ``nbytes`` are omitted, defaults are computed from
        the shape and dtype; flags remember whether the defaults were used.
        """
        self.shape = wrap_in_tuple(shape) if shape is not None else tuple()
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)

        # Fall back to the default strides for this shape/itemsize.
        default_strides = helpers.default_strides(
            self.shape, self.dtype.itemsize)
        strides = default_strides if strides is None else tuple(strides)
        self._default_strides = (strides == default_strides)
        self.strides = strides

        # Minimal buffer size that can hold this layout.
        default_nbytes = helpers.min_buffer_size(
            self.shape, self.dtype.itemsize, self.strides)
        self._default_nbytes = (nbytes is None) or (nbytes == default_nbytes)
        self.nbytes = default_nbytes if nbytes is None else nbytes

        self.offset = offset
        self._cast = dtypes.cast(self.dtype)
示例#5
0
文件: ocl.py 项目: xexo7C8/reikna
    def array(self,
              shape,
              dtype,
              strides=None,
              offset=0,
              nbytes=None,
              allocator=None,
              base=None,
              base_data=None):
        """Create an :py:class:`Array`, allocating a base buffer if needed."""
        allocator = self.allocate if allocator is None else allocator

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            # Minimal buffer that can hold an array with this layout.
            nbytes = min_buffer_size(
                shape, dtype.itemsize, strides=strides, offset=offset)

        if base is not None:
            base_data = base.data
        elif (offset != 0 or strides is not None) and base_data is None:
            # A view with a non-trivial layout needs an explicit backing buffer.
            base_data = allocator(nbytes)

        return Array(self,
                     shape,
                     dtype,
                     strides=strides,
                     offset=offset,
                     allocator=allocator,
                     base_data=base_data,
                     nbytes=nbytes)
示例#6
0
文件: helpers.py 项目: mgolub2/reikna
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """Create a random numpy array of the given shape and dtype for tests.

    Structured dtypes are filled field by field; complex dtypes get
    independent random real and imaginary parts.  ``no_zeros`` raises the
    lower bound above zero; ``strides`` applies a custom memory layout.
    """
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        # Structured dtype: generate each field recursively.
        result = numpy.empty(shape, dtype)
        for field in dtype.names:
            result[field] = get_test_array(
                shape, dtype[field], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            low = 1 if no_zeros else 0
            upper = 100 if high is None else high  # will work even with signed chars
            draw = lambda: numpy.random.randint(low, upper, shape).astype(dtype)
        else:
            low = 0.01 if no_zeros else 0
            upper = 1.0 if high is None else high
            draw = lambda: numpy.random.uniform(low, upper, shape).astype(dtype)

        result = draw() + 1j * draw() if dtypes.is_complex(dtype) else draw()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
示例#7
0
文件: helpers.py 项目: ringw/reikna
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """Return a random numpy array with the requested shape/dtype/strides."""
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is None:
        integer = dtypes.is_integer(dtype)
        if high is None:
            # 100 will work even with signed chars
            high = 100 if integer else 1.0
        if integer:
            low = 1 if no_zeros else 0
            sample = lambda: numpy.random.randint(low, high, shape).astype(dtype)
        else:
            low = 0.01 if no_zeros else 0
            sample = lambda: numpy.random.uniform(low, high, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            result = sample() + 1j * sample()
        else:
            result = sample()
    else:
        # Structured dtype: fill each field separately.
        result = numpy.empty(shape, dtype)
        for name in dtype.names:
            result[name] = get_test_array(
                shape, dtype[name], no_zeros=no_zeros, high=high)

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
示例#8
0
 def __init__(self, dtype, shape=None, strides=None):
     """Array metadata: shape, element count, dtype, C type and strides."""
     self.shape = wrap_in_tuple(shape) if shape is not None else tuple()
     self.size = product(self.shape)
     self.dtype = dtypes.normalize_type(dtype)
     self.ctype = dtypes.ctype_module(self.dtype)
     if strides is not None:
         self.strides = strides
     else:
         # Default to C-contiguous (row-major) strides, in bytes.
         itemsize = self.dtype.itemsize
         self.strides = tuple(
             itemsize * product(self.shape[dim + 1:])
             for dim in range(len(self.shape)))
     self._cast = dtypes.cast(self.dtype)
示例#9
0
文件: signature.py 项目: ringw/reikna
 def __init__(self, dtype, shape=None, strides=None):
     """Describe an array value: shape, size, dtype and strides."""
     self.shape = tuple() if shape is None else wrap_in_tuple(shape)
     self.size = product(self.shape)
     self.dtype = dtypes.normalize_type(dtype)
     self.ctype = dtypes.ctype_module(self.dtype)
     if strides is None:
         # Build row-major (C-contiguous) strides in bytes, innermost first.
         reversed_strides = []
         stride = self.dtype.itemsize
         for extent in reversed(self.shape):
             reversed_strides.append(stride)
             stride *= extent
         self.strides = tuple(reversed(reversed_strides))
     else:
         self.strides = strides
     self._cast = dtypes.cast(self.dtype)
示例#10
0
    def __init__(self, arr_t, predicate, axes=None, output_arr_t=None):
        """Set up a reduction of ``arr_t`` over ``axes`` using ``predicate``.

        ``predicate`` supplies ``operation`` and ``empty`` (the identity
        element).  ``output_arr_t``, if given, must match the input dtype
        and the reduced shape.
        """

        dims = len(arr_t.shape)

        # Normalize the axes: the default is "reduce over every axis".
        if axes is None:
            axes = tuple(range(dims))
        else:
            axes = tuple(sorted(helpers.wrap_in_tuple(axes)))

        if len(axes) != len(set(axes)):
            raise ValueError("Cannot reduce twice over the same axis")

        if min(axes) < 0 or max(axes) >= dims:
            raise ValueError("Axes numbers are out of bounds")

        # The identity element may already carry a dtype; otherwise cast
        # the plain scalar to the input array's dtype.
        if hasattr(predicate.empty, 'dtype'):
            if arr_t.dtype != predicate.empty.dtype:
                raise ValueError(
                    "The predicate and the array must use the same data type")
            empty = predicate.empty
        else:
            empty = dtypes.cast(arr_t.dtype)(predicate.empty)

        remaining_axes = tuple(a for a in range(dims) if a not in axes)
        output_shape = tuple(arr_t.shape[a] for a in remaining_axes)

        # If the reduced axes are already the innermost ones, no transposition
        # is needed; otherwise move them to the end first.
        trailing = tuple(range(dims - len(axes), dims))
        self._transpose_axes = None if axes == trailing else remaining_axes + axes

        self._operation = predicate.operation
        self._empty = empty

        if output_arr_t is None:
            output_arr_t = Type(arr_t.dtype, shape=output_shape)
        else:
            if output_arr_t.dtype != arr_t.dtype:
                raise ValueError(
                    "The dtype of the output array must be the same as that of the input array"
                )
            if output_arr_t.shape != output_shape:
                raise ValueError("Expected the output array shape " +
                                 str(output_shape) + ", got " +
                                 str(output_arr_t.shape))

        Computation.__init__(self, [
            Parameter('output', Annotation(output_arr_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i'))
        ])
示例#11
0
    def prepare(self, global_size, local_size=None, local_mem=0):
        """Compute and store the launch grid for the given work sizes."""
        global_size = wrap_in_tuple(global_size)
        self._local_mem = local_mem

        max_dims = self._thr.device_params.max_work_item_sizes
        if len(global_size) > len(max_dims):
            raise ValueError("Global size has too many dimensions")

        if local_size is not None:
            local_size = wrap_in_tuple(local_size)
            if len(global_size) != len(local_size):
                raise ValueError("Global/local work sizes have differing dimensions")
        else:
            local_size = find_local_size(global_size, max_dims, self.max_work_group_size)

        for gs, ls in zip(global_size, local_size):
            if gs % ls != 0:
                raise ValueError("Global sizes must be multiples of corresponding local sizes")
        grid = tuple(gs // ls for gs, ls in zip(global_size, local_size))

        # Pad with 1s up to three dimensions, otherwise PyCUDA will complain.
        pad = (1,) * (3 - len(grid))
        self._local_size = local_size + pad
        self._grid = grid + pad
示例#12
0
def normalize_constant_arrays(constant_arrays):
    """Convert a constant-array specification to ``{name: (length, dtype)}``.

    Each value may be either an array-like object (anything with ``shape``
    and ``dtype`` attributes) or a ``(shape, dtype)`` pair.  Returns an
    empty dict when ``constant_arrays`` is ``None``.
    """
    if constant_arrays is None:
        return {}

    normalized = {}
    for name, metadata in constant_arrays.items():
        array_like = hasattr(metadata, 'shape') and hasattr(metadata, 'dtype')
        if array_like:
            shape, dtype = metadata.shape, metadata.dtype
        else:
            shape, dtype = metadata
            shape = wrap_in_tuple(shape)
        normalized[name] = (product(shape), dtype)

    return normalized
示例#13
0
文件: api.py 项目: fjarri/reikna
def normalize_constant_arrays(constant_arrays):
    """Normalize constant array metadata into ``name -> (length, dtype)``."""
    if constant_arrays is None:
        return {}

    def _length_and_dtype(metadata):
        # An array-like object carries shape/dtype itself;
        # otherwise expect a (shape, dtype) pair.
        if hasattr(metadata, 'shape') and hasattr(metadata, 'dtype'):
            shape, dtype = metadata.shape, metadata.dtype
        else:
            shape, dtype = metadata
            shape = wrap_in_tuple(shape)
        return product(shape), dtype

    return {
        name: _length_and_dtype(metadata)
        for name, metadata in constant_arrays.items()}
示例#14
0
文件: reduce.py 项目: fjarri/reikna
    def __init__(self, arr_t, predicate, axes=None, output_arr_t=None):
        """Prepare a reduction of ``arr_t`` over ``axes``.

        :param arr_t: metadata of the input array (provides ``shape`` and
            ``dtype``).
        :param predicate: an object providing ``operation`` and ``empty``
            (the identity element of the reduction).
        :param axes: axes to reduce over; defaults to all axes.
        :param output_arr_t: optional metadata for the output array; must
            match the input dtype and the reduced shape.
        :raises ValueError: on repeated or out-of-range axes, or on an
            output dtype/shape mismatch.
        """

        dims = len(arr_t.shape)

        # Default: reduce over every axis; otherwise normalize to a sorted tuple.
        if axes is None:
            axes = tuple(range(dims))
        else:
            axes = tuple(sorted(helpers.wrap_in_tuple(axes)))

        if len(set(axes)) != len(axes):
            raise ValueError("Cannot reduce twice over the same axis")

        if min(axes) < 0 or max(axes) >= dims:
            raise ValueError("Axes numbers are out of bounds")

        # The identity element may already carry a dtype; otherwise cast the
        # plain scalar to the input array's dtype.
        if hasattr(predicate.empty, 'dtype'):
            if arr_t.dtype != predicate.empty.dtype:
                raise ValueError("The predicate and the array must use the same data type")
            empty = predicate.empty
        else:
            empty = dtypes.cast(arr_t.dtype)(predicate.empty)

        remaining_axes = tuple(a for a in range(dims) if a not in axes)
        output_shape = tuple(arr_t.shape[a] for a in remaining_axes)

        # No transposition is needed when the reduced axes are already the
        # innermost ones; otherwise move them to the end before reducing.
        if axes == tuple(range(dims - len(axes), dims)):
            self._transpose_axes = None
        else:
            self._transpose_axes = remaining_axes + axes

        self._operation = predicate.operation
        self._empty = empty

        if output_arr_t is None:
            output_arr_t = Type(arr_t.dtype, shape=output_shape)
        else:
            if output_arr_t.dtype != arr_t.dtype:
                raise ValueError(
                    "The dtype of the output array must be the same as that of the input array")
            if output_arr_t.shape != output_shape:
                raise ValueError(
                    "Expected the output array shape " + str(output_shape) +
                    ", got " + str(output_arr_t.shape))

        Computation.__init__(self, [
            Parameter('output', Annotation(output_arr_t, 'o')),
            Parameter('input', Annotation(arr_t, 'i'))])
示例#15
0
    def array(self,
              shape,
              dtype,
              strides=None,
              offset=0,
              nbytes=None,
              allocator=None,
              base=None,
              base_data=None):
        """Create an :py:class:`Array` backed by device memory."""
        # In PyCUDA, the default allocator is not None, but a default alloc object
        if allocator is None:
            allocator = cuda.mem_alloc

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            # Explicit int() so that nbytes is a plain Python integer.
            nbytes = int(min_buffer_size(
                shape, dtype.itemsize, strides=strides, offset=offset))

        if base is not None:
            # Reuse the buffer of an existing array (ours, or one exposing
            # ``gpudata`` — presumably a PyCUDA GPUArray).
            base_data = base.base_data if isinstance(base, Array) else base.gpudata
        elif (offset != 0 or strides is not None) and base_data is None:
            # A non-trivial layout needs its own backing buffer.
            base_data = allocator(nbytes)

        return Array(self,
                     shape,
                     dtype,
                     strides=strides,
                     allocator=allocator,
                     offset=offset,
                     base_data=base_data,
                     nbytes=nbytes)
示例#16
0
    def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
        """Array metadata: shape, dtype, strides, offset and buffer size."""
        if shape is None:
            self.shape = tuple()
        else:
            self.shape = wrap_in_tuple(shape)
        self.size = product(self.shape)
        self.dtype = dtypes.normalize_type(dtype)
        self.ctype = dtypes.ctype_module(self.dtype)

        # Remember whether the caller stuck with the default strides.
        default_strides = helpers.default_strides(self.shape, self.dtype.itemsize)
        if strides is not None:
            strides = tuple(strides)
        else:
            strides = default_strides
        self._default_strides = strides == default_strides
        self.strides = strides

        # Likewise for the buffer size: default is the minimal size that
        # can hold this layout.
        default_nbytes = helpers.min_buffer_size(
            self.shape, self.dtype.itemsize, self.strides)
        if nbytes is None:
            nbytes = default_nbytes
        self._default_nbytes = nbytes == default_nbytes
        self.nbytes = nbytes

        self.offset = offset
        self._cast = dtypes.cast(self.dtype)
示例#17
0
文件: cuda.py 项目: fjarri/reikna
    def array(
            self, shape, dtype, strides=None, offset=0, nbytes=None,
            allocator=None, base=None, base_data=None):
        """Allocate (or wrap) a device buffer and return an Array over it."""
        # In PyCUDA, the default allocator is not None, but a default alloc object
        if allocator is None:
            allocator = cuda.mem_alloc

        dtype = dtypes.normalize_type(dtype)
        shape = wrap_in_tuple(shape)
        if nbytes is None:
            # int() ensures a plain Python integer buffer size.
            nbytes = int(min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset))

        needs_base = offset != 0 or strides is not None
        if needs_base and base is None and base_data is None:
            # A view with a non-trivial layout needs its own backing buffer.
            base_data = allocator(nbytes)
        elif base is not None:
            if isinstance(base, Array):
                base_data = base.base_data
            else:
                # Assume an object exposing ``gpudata`` (a PyCUDA GPUArray).
                base_data = base.gpudata

        return Array(
            self, shape, dtype, strides=strides, allocator=allocator,
            offset=offset, base_data=base_data, nbytes=nbytes)
示例#18
0
    def __init__(self, device_params, virtual_global_size,
            virtual_local_size=None, max_local_size=None):
        """Map requested "virtual" work sizes onto real device launch sizes.

        :param device_params: device limits; this code reads
            ``max_work_group_size``, ``max_work_item_sizes``,
            ``max_num_groups`` and ``warp_size``.
        :param virtual_global_size: requested global size (row-major shape).
        :param virtual_local_size: optional requested local size; when
            omitted, one is chosen automatically.
        :param max_local_size: optional explicit cap on local sizes.
        :raises ValueError: if global and local sizes differ in dimensions.
        :raises OutOfResourcesError: if the requested sizes exceed limits.
        """

        virtual_global_size = wrap_in_tuple(virtual_global_size)
        if virtual_local_size is not None:
            virtual_local_size = wrap_in_tuple(virtual_local_size)
            if len(virtual_local_size) != len(virtual_global_size):
                raise ValueError("Global size and local size must have the same dimensions")

        # Since the device uses column-major ordering of sizes, while we get
        # row-major ordered shapes, we temporarily invert our shapes
        # to facilitate internal handling.
        virtual_global_size = tuple(reversed(virtual_global_size))
        if virtual_local_size is not None:
            virtual_local_size = tuple(reversed(virtual_local_size))

        # Restrict local sizes using the provided explicit limit
        if max_local_size is not None:
            max_work_group_size = min(
                max_local_size,
                device_params.max_work_group_size,
                product(device_params.max_work_item_sizes))
            max_work_item_sizes = [
                min(max_local_size, mwis) for mwis in device_params.max_work_item_sizes]
        else:
            # Assuming:
            # 1) max_work_group_size <= product(max_work_item_sizes)
            # 2) max(max_work_item_sizes) <= max_work_group_size
            max_work_group_size = device_params.max_work_group_size
            max_work_item_sizes = device_params.max_work_item_sizes

        if virtual_local_size is None:
            # FIXME: we can obtain better results by taking occupancy into account here,
            # but for now we will assume that the more threads, the better.
            flat_global_size = product(virtual_global_size)
            multiple = device_params.warp_size

            if flat_global_size < max_work_group_size:
                flat_local_size = flat_global_size
            elif max_work_group_size < multiple:
                flat_local_size = 1
            else:
                # Largest multiple of the warp size that fits in a work group.
                flat_local_size = multiple * (max_work_group_size // multiple)

            # product(virtual_local_size) == flat_local_size <= max_work_group_size
            virtual_local_size = find_local_size(virtual_global_size, flat_local_size)
        else:
            if product(virtual_local_size) > max_work_group_size:
                raise OutOfResourcesError(
                    "Requested local size is greater than the maximum " + str(max_work_group_size))

        # Global and local sizes supported by CUDA or OpenCL restricted number of dimensions,
        # which may have limited size, so we need to pack our multidimensional sizes.

        virtual_grid_size = tuple(
            min_blocks(gs, ls) for gs, ls in zip(virtual_global_size, virtual_local_size))
        bounding_global_size = tuple(
            grs * ls for grs, ls in zip(virtual_grid_size, virtual_local_size))

        if product(virtual_grid_size) > product(device_params.max_num_groups):
            raise OutOfResourcesError(
                "Bounding global size " + repr(bounding_global_size) + " is too large")

        local_groups = ShapeGroups(virtual_local_size, max_work_item_sizes)
        grid_groups = ShapeGroups(virtual_grid_size, device_params.max_num_groups)

        # Returning back to the row-major ordering
        self.virtual_local_size = tuple(reversed(virtual_local_size))
        self.virtual_global_size = tuple(reversed(virtual_global_size))

        # These can be different lengths because of the expansion into multiple
        # dimensions that find_bounding_shape() does.
        real_local_size = tuple(local_groups.bounding_shape)
        real_grid_size = tuple(grid_groups.bounding_shape)

        # Pad the shorter of the two with 1s so they have equal length.
        diff = len(real_local_size) - len(real_grid_size)
        if diff > 0:
            self.real_local_size = real_local_size
            self.real_grid_size = real_grid_size + (1,) * abs(diff)
        else:
            self.real_local_size = real_local_size + (1,) * abs(diff)
            self.real_grid_size = real_grid_size

        self.real_global_size = tuple(
            gs * ls for gs, ls
            in zip(self.real_grid_size, self.real_local_size))

        # This function will be used to translate between internal column-major vdims
        # and user-supplied row-major vdims.
        vdim_inverse = lambda dim: len(self.virtual_local_size) - dim - 1

        # Render the kernel-side size-query functions from the template.
        self.vsize_functions = render_template(
            TEMPLATE,
            virtual_local_size=virtual_local_size,
            virtual_global_size=virtual_global_size,
            bounding_global_size=bounding_global_size,
            virtual_grid_size=virtual_grid_size,
            local_groups=local_groups,
            grid_groups=grid_groups,
            product=product,
            vdim_inverse=vdim_inverse)