def prepare(self, global_size, local_size=None, local_mem=0):
    global_size = wrap_in_tuple(global_size)
    self._local_mem = local_mem

    max_dims = self._thr.device_params.max_work_item_sizes
    if len(global_size) > len(max_dims):
        raise ValueError("Global size has too many dimensions")

    if local_size is not None:
        local_size = wrap_in_tuple(local_size)
        if len(local_size) != len(global_size):
            raise ValueError("Global/local work sizes have differing dimensions")
    else:
        local_size = find_local_size(global_size, max_dims, self.max_work_group_size)

    grid = []
    for gs, ls in zip(global_size, local_size):
        if gs % ls != 0:
            raise ValueError("Global sizes must be multiples of corresponding local sizes")
        grid.append(gs // ls)

    # Append missing dimensions, otherwise PyCUDA will complain.
    self._local_size = local_size + (1,) * (3 - len(grid))
    self._grid = tuple(grid) + (1,) * (3 - len(grid))
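# A minimal standalone sketch of the grid computation and dimension padding
# above, using hypothetical sizes (the values below are illustrative only):
_example_global_size = (1024, 16)
_example_local_size = (64, 4)
_example_grid = tuple(
    gs // ls for gs, ls in zip(_example_global_size, _example_local_size))
# Pad both tuples with trailing 1s up to the 3 dimensions PyCUDA expects:
assert _example_local_size + (1,) * (3 - len(_example_grid)) == (64, 4, 1)
assert tuple(_example_grid) + (1,) * (3 - len(_example_grid)) == (16, 4, 1)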
def prepare(self, global_size, local_size=None, local_mem=0):
    # ``local_mem`` is ignored, since it cannot be easily passed to the kernel
    # (a special kernel argument is required).
    if local_size is None:
        self._local_size = None
    else:
        self._local_size = wrap_in_tuple(local_size)
    self._global_size = wrap_in_tuple(global_size)
def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
    self.shape = tuple() if shape is None else wrap_in_tuple(shape)
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    default_strides = helpers.default_strides(self.shape, self.dtype.itemsize)
    if strides is None:
        strides = default_strides
    else:
        strides = tuple(strides)
    self._default_strides = strides == default_strides
    self.strides = strides

    default_nbytes = helpers.min_buffer_size(self.shape, self.dtype.itemsize, self.strides)
    if nbytes is None:
        nbytes = default_nbytes
    self._default_nbytes = nbytes == default_nbytes
    self.nbytes = nbytes

    self.offset = offset
    self._cast = dtypes.cast(self.dtype)
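# A standalone sketch of the minimum buffer size used above, under the
# assumption that helpers.min_buffer_size() computes the byte offset of the
# last addressable element plus one item (non-negative strides assumed):
def _min_buffer_size_sketch(shape, itemsize, strides):
    last = sum((dim - 1) * stride for dim, stride in zip(shape, strides))
    return last + itemsize

# For a C-contiguous (4, 3) float32 array this is simply nbytes:
assert _min_buffer_size_sketch((4, 3), 4, (12, 4)) == 48
# With rows padded to 16 bytes, extra space is required:
assert _min_buffer_size_sketch((4, 3), 4, (16, 4)) == 60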
def array(
        self, shape, dtype, strides=None, offset=0, nbytes=None,
        allocator=None, base=None, base_data=None):
    if allocator is None:
        allocator = self.allocate

    dtype = dtypes.normalize_type(dtype)
    shape = wrap_in_tuple(shape)

    if nbytes is None:
        nbytes = min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset)

    if (offset != 0 or strides is not None) and base_data is None and base is None:
        base_data = allocator(nbytes)
    elif base is not None:
        base_data = base.data

    return Array(
        self, shape, dtype, strides=strides, offset=offset,
        allocator=allocator, base_data=base_data, nbytes=nbytes)
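# Hypothetical usage of array() above; `thr` is assumed to be an initialized
# thread object, and the sizes are illustrative only:
#
#     a = thr.array((1024,), numpy.float32)
#     # A nonzero offset (or custom strides) makes a view; supplying `base`
#     # reuses its buffer instead of allocating a fresh one:
#     b = thr.array((1008,), numpy.float32, offset=16, base=a)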
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        result = numpy.empty(shape, dtype)
        for name in dtype.names:
            result[name] = get_test_array(shape, dtype[name], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            low = 1 if no_zeros else 0
            if high is None:
                high = 100  # will work even with signed chars
            get_arr = lambda: numpy.random.randint(low, high, shape).astype(dtype)
        else:
            low = 0.01 if no_zeros else 0
            if high is None:
                high = 1.0
            get_arr = lambda: numpy.random.uniform(low, high, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            result = get_arr() + 1j * get_arr()
        else:
            result = get_arr()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
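# Hypothetical usage of get_test_array() above (assuming its helper imports,
# wrap_in_tuple / dtypes / as_strided, are available in scope):
#
#     arr = get_test_array((8, 8), numpy.int32, no_zeros=True)
#     # integer dtypes are drawn from [1, 100) when no_zeros is set
#     pair = numpy.dtype([('re', numpy.float32), ('im', numpy.float32)])
#     rec = get_test_array(16, pair)  # each field is filled independently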
def __init__(self, dtype, shape=None, strides=None):
    self.shape = tuple() if shape is None else wrap_in_tuple(shape)
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    if strides is None:
        # Default C-contiguous strides: each axis strides over the product
        # of all later dimensions, in bytes.
        self.strides = tuple(
            self.dtype.itemsize * product(self.shape[i + 1:])
            for i in range(len(self.shape)))
    else:
        self.strides = strides

    self._cast = dtypes.cast(self.dtype)
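# A worked example of the default stride formula above for shape (2, 3, 4)
# and float64 (itemsize 8): strides[i] = itemsize * product(shape[i+1:]),
# giving (8 * 12, 8 * 4, 8 * 1). This matches numpy's C-contiguous layout:
import numpy
assert numpy.empty((2, 3, 4), numpy.float64).strides == (96, 32, 8)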
def __init__(self, arr_t, predicate, axes=None, output_arr_t=None):
    dims = len(arr_t.shape)

    if axes is None:
        axes = tuple(range(dims))
    else:
        axes = tuple(sorted(helpers.wrap_in_tuple(axes)))

    if len(set(axes)) != len(axes):
        raise ValueError("Cannot reduce twice over the same axis")
    if min(axes) < 0 or max(axes) >= dims:
        raise ValueError("Axes numbers are out of bounds")

    if hasattr(predicate.empty, 'dtype'):
        if arr_t.dtype != predicate.empty.dtype:
            raise ValueError("The predicate and the array must use the same data type")
        empty = predicate.empty
    else:
        empty = dtypes.cast(arr_t.dtype)(predicate.empty)

    remaining_axes = tuple(a for a in range(dims) if a not in axes)
    output_shape = tuple(arr_t.shape[a] for a in remaining_axes)

    # If the reduced axes are already the innermost ones, no transposition is needed.
    if axes == tuple(range(dims - len(axes), dims)):
        self._transpose_axes = None
    else:
        self._transpose_axes = remaining_axes + axes

    self._operation = predicate.operation
    self._empty = empty

    if output_arr_t is None:
        output_arr_t = Type(arr_t.dtype, shape=output_shape)
    else:
        if output_arr_t.dtype != arr_t.dtype:
            raise ValueError(
                "The dtype of the output array must be the same as that of the input array")
        if output_arr_t.shape != output_shape:
            raise ValueError(
                "Expected the output array shape " + str(output_shape)
                + ", got " + str(output_arr_t.shape))

    Computation.__init__(self, [
        Parameter('output', Annotation(output_arr_t, 'o')),
        Parameter('input', Annotation(arr_t, 'i'))])
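# A worked illustration of the axis bookkeeping above, for a hypothetical
# reduction of a (5, 6, 7) array over axes (0, 2):
_shape = (5, 6, 7)
_dims = len(_shape)
_axes = (0, 2)
_remaining_axes = tuple(a for a in range(_dims) if a not in _axes)
assert tuple(_shape[a] for a in _remaining_axes) == (6,)  # output shape
# The reduced axes are not the innermost ones (that would be (1, 2)),
# so a transposition to remaining_axes + axes is needed first, which makes
# the reduced axes contiguous and innermost:
assert _axes != tuple(range(_dims - len(_axes), _dims))
assert _remaining_axes + _axes == (1, 0, 2)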
def normalize_constant_arrays(constant_arrays):
    if constant_arrays is None:
        return {}

    normalized = {}
    for name, metadata in constant_arrays.items():
        if hasattr(metadata, 'shape') and hasattr(metadata, 'dtype'):
            shape, dtype = metadata.shape, metadata.dtype
        else:
            shape, dtype = metadata
        shape = wrap_in_tuple(shape)
        length = product(shape)
        normalized[name] = (length, dtype)

    return normalized
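# Hypothetical usage of normalize_constant_arrays() above (assuming
# wrap_in_tuple and product are in scope); both metadata forms are accepted:
#
#     arrays = {
#         'coeffs': numpy.zeros((4, 4), numpy.float32),  # has .shape/.dtype
#         'lut': ((256,), numpy.int32),                   # plain (shape, dtype) pair
#     }
#     normalize_constant_arrays(arrays)
#     # -> {'coeffs': (16, dtype('float32')), 'lut': (256, numpy.int32)}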
def array(
        self, shape, dtype, strides=None, offset=0, nbytes=None,
        allocator=None, base=None, base_data=None):
    # In PyCUDA, the default allocator is not None, but a default alloc object.
    if allocator is None:
        allocator = cuda.mem_alloc

    dtype = dtypes.normalize_type(dtype)
    shape = wrap_in_tuple(shape)

    if nbytes is None:
        nbytes = int(min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset))

    if (offset != 0 or strides is not None) and base_data is None and base is None:
        base_data = allocator(nbytes)
    elif base is not None:
        if isinstance(base, Array):
            base_data = base.base_data
        else:
            base_data = base.gpudata

    return Array(
        self, shape, dtype, strides=strides, allocator=allocator,
        offset=offset, base_data=base_data, nbytes=nbytes)
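# Hypothetical usage of the CUDA array() above; `thr` is assumed to be an
# initialized CUDA thread, and `raw` an existing PyCUDA GPUArray, whose
# .gpudata attribute then serves as the base buffer of the view:
#
#     raw = pycuda.gpuarray.zeros(1024, numpy.float32)
#     view = thr.array((512,), numpy.float32, offset=512 * 4, base=raw)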
def __init__(self, device_params, virtual_global_size,
        virtual_local_size=None, max_local_size=None):

    virtual_global_size = wrap_in_tuple(virtual_global_size)
    if virtual_local_size is not None:
        virtual_local_size = wrap_in_tuple(virtual_local_size)
        if len(virtual_local_size) != len(virtual_global_size):
            raise ValueError("Global size and local size must have the same dimensions")

    # Since the device uses column-major ordering of sizes, while we get
    # row-major ordered shapes, we temporarily invert our shapes
    # to facilitate internal handling.
    virtual_global_size = tuple(reversed(virtual_global_size))
    if virtual_local_size is not None:
        virtual_local_size = tuple(reversed(virtual_local_size))

    # Restrict local sizes using the provided explicit limit.
    if max_local_size is not None:
        max_work_group_size = min(
            max_local_size,
            device_params.max_work_group_size,
            product(device_params.max_work_item_sizes))
        max_work_item_sizes = [
            min(max_local_size, mwis) for mwis in device_params.max_work_item_sizes]
    else:
        # Assuming:
        # 1) max_work_group_size <= product(max_work_item_sizes)
        # 2) max(max_work_item_sizes) <= max_work_group_size
        max_work_group_size = device_params.max_work_group_size
        max_work_item_sizes = device_params.max_work_item_sizes

    if virtual_local_size is None:
        # FIXME: we can obtain better results by taking occupancy into account here,
        # but for now we will assume that the more threads, the better.
        flat_global_size = product(virtual_global_size)
        multiple = device_params.warp_size

        if flat_global_size < max_work_group_size:
            flat_local_size = flat_global_size
        elif max_work_group_size < multiple:
            flat_local_size = 1
        else:
            flat_local_size = multiple * (max_work_group_size // multiple)

        # product(virtual_local_size) == flat_local_size <= max_work_group_size
        virtual_local_size = find_local_size(virtual_global_size, flat_local_size)
    else:
        if product(virtual_local_size) > max_work_group_size:
            raise OutOfResourcesError(
                "Requested local size is greater than the maximum "
                + str(max_work_group_size))

    # CUDA and OpenCL support global and local sizes in a restricted number
    # of dimensions, each of which may have a limited size, so we need to pack
    # our multidimensional sizes.
    virtual_grid_size = tuple(
        min_blocks(gs, ls) for gs, ls in zip(virtual_global_size, virtual_local_size))
    bounding_global_size = tuple(
        grs * ls for grs, ls in zip(virtual_grid_size, virtual_local_size))

    if product(virtual_grid_size) > product(device_params.max_num_groups):
        raise OutOfResourcesError(
            "Bounding global size " + repr(bounding_global_size) + " is too large")

    local_groups = ShapeGroups(virtual_local_size, max_work_item_sizes)
    grid_groups = ShapeGroups(virtual_grid_size, device_params.max_num_groups)

    # Returning back to the row-major ordering.
    self.virtual_local_size = tuple(reversed(virtual_local_size))
    self.virtual_global_size = tuple(reversed(virtual_global_size))

    # These can have different lengths because of the expansion into multiple
    # dimensions that find_bounding_shape() performs.
    real_local_size = tuple(local_groups.bounding_shape)
    real_grid_size = tuple(grid_groups.bounding_shape)

    diff = len(real_local_size) - len(real_grid_size)
    if diff > 0:
        self.real_local_size = real_local_size
        self.real_grid_size = real_grid_size + (1,) * abs(diff)
    else:
        self.real_local_size = real_local_size + (1,) * abs(diff)
        self.real_grid_size = real_grid_size

    self.real_global_size = tuple(
        gs * ls for gs, ls in zip(self.real_grid_size, self.real_local_size))

    # This function will be used to translate between internal column-major vdims
    # and user-supplied row-major vdims.
    vdim_inverse = lambda dim: len(self.virtual_local_size) - dim - 1

    self.vsize_functions = render_template(
        TEMPLATE,
        virtual_local_size=virtual_local_size,
        virtual_global_size=virtual_global_size,
        bounding_global_size=bounding_global_size,
        virtual_grid_size=virtual_grid_size,
        local_groups=local_groups,
        grid_groups=grid_groups,
        product=product,
        vdim_inverse=vdim_inverse)
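# A worked example of the flat local size rounding and grid computation above,
# with hypothetical device limits (min_blocks is assumed to be a ceiling division):
_warp_size = 32                  # `multiple`
_max_work_group_size = 1000
# The local size is the largest multiple of the warp size that fits:
assert _warp_size * (_max_work_group_size // _warp_size) == 992

def _min_blocks_sketch(size, block):
    return (size + block - 1) // block

_virtual_global_size = (500, 3)  # column-major at this point
_virtual_local_size = (32, 1)
_grid = tuple(_min_blocks_sketch(gs, ls)
              for gs, ls in zip(_virtual_global_size, _virtual_local_size))
assert _grid == (16, 3)
# The bounding global size rounds each dimension up to a multiple of the local size:
assert tuple(g * l for g, l in zip(_grid, _virtual_local_size)) == (512, 3)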