def _kernel(result, source, powers, powers_idx):
    # Negacyclic shift of polynomial coefficients by ``powers``.
    # ``powers_view``, ``batch_shape``, ``powers_shape``, ``poly_batch_shape``,
    # ``polynomial_degree``, ``invert_powers`` and ``minus_one`` are presumably
    # captured from an enclosing scope (a closure or a template render context).
    if powers_view:
        powers = powers.reshape(product(batch_shape), powers_shape[-1])[:, powers_idx]
    else:
        powers = powers.flatten()

    result = result.reshape(product(batch_shape), product(poly_batch_shape), polynomial_degree)
    source = source.reshape(product(batch_shape), product(poly_batch_shape), polynomial_degree)

    if invert_powers:
        powers = 2 * polynomial_degree - powers

    for i in range(result.shape[0]):
        power = powers[i]
        if power < polynomial_degree:
            result[i, :, :power] = -source[i, :, (polynomial_degree - power):polynomial_degree]
            result[i, :, power:polynomial_degree] = source[i, :, :(polynomial_degree - power)]
        else:
            power = power - polynomial_degree
            result[i, :, :power] = source[i, :, (polynomial_degree - power):polynomial_degree]
            result[i, :, power:polynomial_degree] = -source[i, :, :(polynomial_degree - power)]

    if minus_one:
        result -= source
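# Many snippets in this section rely on ``product`` and ``min_blocks`` helper
# functions (sometimes qualified as ``helpers.product``) whose definitions are
# not included here. A minimal stand-in for experimenting with the code,
# assuming the conventional definitions (the product of an empty sequence is 1):

from functools import reduce
import operator

def product(seq):
    # Multiplies all elements of ``seq``; returns 1 for an empty sequence.
    return reduce(operator.mul, seq, 1)

def min_blocks(length, block):
    # The minimal number of blocks of size ``block`` needed to cover ``length`` elements.
    return (length - 1) // block + 1

assert product((2, 3, 4)) == 24 and product(()) == 1
assert min_blocks(1000, 128) == 8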
def test_group_dimensions(virtual_shape, available_shape):
    """
    Tests that ``group_dimensions()`` obeys its contracts.
    """
    v_groups, a_groups = vsize.group_dimensions(virtual_shape, available_shape)

    v_dims = []
    a_dims = []
    for v_group, a_group in zip(v_groups, a_groups):

        # Check that axis indices in groups are actually in range
        assert all(vdim < len(virtual_shape) for vdim in v_group)
        assert all(adim < len(available_shape) for adim in a_group)

        # Check that the total number of elements (threads) in the virtual group
        # is not greater than the number of elements in the real group
        v_shape = virtual_shape[v_group[0]:v_group[-1]+1]
        a_shape = available_shape[a_group[0]:a_group[-1]+1]
        assert product(v_shape) <= product(a_shape)

        v_dims += v_group
        a_dims += a_group

    # Check that both the virtual and the real group axes add up
    # to a contiguous list without intersections.
    assert v_dims == list(range(len(virtual_shape)))
    assert a_dims == list(range(len(available_shape[:len(a_dims)])))
def __call__(self, psi):
    psi_nobs = psi.get()
    psi = beam_splitter(psi_nobs)

    ns_nobs = numpy.abs(psi_nobs) ** 2 - 0.5 / product(self.ds)
    ns = numpy.abs(psi) ** 2 - 0.5 / product(self.ds)

    if len(psi.shape) == 3: # 1D
        n_nobs = ns_nobs.mean(1)
        n = ns.mean(1)
        Ns = (ns * self.ds[0]).sum(-1)
    elif len(psi.shape) == 4: # 2D
        n_nobs = ns_nobs.mean(1)
        n = ns.mean(1)
        Ns = (ns * product(self.ds)).sum(-1).sum(-1)
    elif len(psi.shape) == 5: # 3D
        n_nobs = (ns_nobs.mean(1) * self.ds[1] * self.ds[2]).sum(-1).sum(-1)
        n = (ns.mean(1) * self.ds[1] * self.ds[2]).sum(-1).sum(-1)
        Ns = (ns * product(self.ds)).sum(-1).sum(-1).sum(-1)

    res = dict(
        Nplus_mean=Ns[0].mean(), Nminus_mean=Ns[1].mean(),
        Nplus_std=Ns[0].std(), Nminus_std=Ns[1].std(),
        density=n, density_nobs=n_nobs)

    if len(psi.shape) == 5:
        # Integer division: the slice index must be an int in Python 3.
        res['slice_nobs'] = ns_nobs.mean(1)[:, :, :, ns.shape[-1] // 2]
        res['slice'] = ns.mean(1)[:, :, :, ns.shape[-1] // 2]

    return res
def __init__(self, dtype, device_params, outer_shape, fft_size, curr_size,
        fft_size_real, inner_shape, pass_num, reverse_direction):

    num_passes = len(get_global_radix_info(fft_size)[0])

    real_output_shape = (pass_num == num_passes - 1 and reverse_direction)

    self.name = 'fft_global'
    self.inplace_possible = (pass_num == num_passes - 1 and num_passes % 2 == 1)
    self.output_shape = (
        outer_shape + (fft_size_real if real_output_shape else fft_size,) + inner_shape)

    if fft_size != fft_size_real and pass_num == 0 and reverse_direction:
        self.kweights = get_kweights(fft_size_real, fft_size)
    else:
        self.kweights = None

    self._fft_size = fft_size
    self._curr_size = curr_size
    self._fft_size_real = fft_size_real
    self._local_mem_size = device_params.local_mem_size
    self._itemsize = dtype.itemsize
    self._inner_batch = helpers.product(inner_shape)
    self._outer_batch = helpers.product(outer_shape)
    self._pass_num = pass_num
    self._last_pass = (pass_num == num_passes - 1)

    self._constant_kwds = get_common_kwds(dtype, device_params)
    self._constant_kwds.update(dict(
        takes_kweights=(self.kweights is not None),
        input_slices=(len(outer_shape), 1, len(inner_shape)),
        output_slices=(len(outer_shape), 1, len(inner_shape)),
        pad_in=(fft_size != fft_size_real and pass_num == 0 and not reverse_direction),
        unpad_out=(fft_size != fft_size_real and self._last_pass and reverse_direction),
        reverse_direction=reverse_direction,
        normalize=self._last_pass))
def check_performance(thr_and_double, shape_and_axes):
    thr, double = thr_and_double

    dtype = numpy.complex128 if double else numpy.complex64
    dtype = dtypes.normalize_type(dtype)

    shape, axes = shape_and_axes
    data = numpy.arange(product(shape)).reshape(shape).astype(dtype)

    shift = FFTShift(data, axes=axes)
    shiftc = shift.compile(thr)

    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data)

    attempts = 10
    times = []
    for i in range(attempts):
        t1 = time.time()
        shiftc(res_dev, data_dev)
        thr.synchronize()
        times.append(time.time() - t1)

    res_ref = numpy.fft.fftshift(data, axes=axes)
    assert diff_is_negligible(res_dev.get(), res_ref)

    return min(times), product(shape) * dtype.itemsize
def prepare_for(self, max_local_size):
    kwds = dict(self._constant_kwds)

    radix_arr, radix1_arr, radix2_arr = get_global_radix_info(self._fft_size)

    radix = radix_arr[self._pass_num]
    radix1 = radix1_arr[self._pass_num]
    radix2 = radix2_arr[self._pass_num]

    stride_out = self._inner_batch * helpers.product(radix_arr[:self._pass_num])
    stride = stride_out * radix
    stride_in = stride_out * helpers.product(radix_arr[self._pass_num+1:])

    threads_per_xform = radix2
    coalesce_width = kwds['min_mem_coalesce_width']
    local_batch = max_local_size if radix2 == 1 else coalesce_width
    local_batch = min(local_batch, stride_in)
    local_size = min(local_batch * threads_per_xform, max_local_size)
    local_batch = local_size // threads_per_xform

    workgroups_num = helpers.min_blocks(stride_in, local_batch) * self._outer_batch

    if radix2 == 1:
        lmem_size = 0
    else:
        if stride_out == 1:
            lmem_size = (radix + 1) * local_batch
        else:
            lmem_size = local_size * radix1

    if lmem_size * self._itemsize // 2 > self._local_mem_size:
        raise OutOfResourcesError

    kwds.update(dict(
        fft_size=self._fft_size, curr_size=self._curr_size,
        fft_size_real=self._fft_size_real,
        pass_num=self._pass_num,
        lmem_size=lmem_size, local_batch=local_batch, local_size=local_size,
        inner_batch=self._inner_batch,
        radix_arr=radix_arr, radix1_arr=radix1_arr, radix2_arr=radix2_arr,
        radix1=radix1, radix2=radix2, radix=radix,
        stride_in=stride_in, stride_out=stride_out, stride=stride,
        last_pass=self._last_pass))

    return workgroups_num * local_size, local_size, kwds
def find_local_size(global_size, flat_local_size, threshold=0.05):
    """
    Returns a tuple of the same length as ``global_size``, with the product equal to
    ``flat_local_size``, minimizing the difference between ``product(global_size)`` and
    ``product(ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))``
    (that is, the number of empty threads).
    """
    flat_global_size = product(global_size)
    if flat_local_size >= flat_global_size:
        return global_size

    best_ratio = None
    best_local_size = None
    for local_size in get_decompositions(flat_local_size, len(global_size)):
        bounding_global_size = tuple(
            ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size))
        empty_threads = product(bounding_global_size) - flat_global_size
        ratio = float(empty_threads) / flat_global_size

        # Stop the search early: there may be many decompositions to iterate over,
        # and we do not need the perfect solution.
        if ratio < threshold:
            return local_size

        if best_ratio is None or ratio < best_ratio:
            best_ratio = ratio
            best_local_size = local_size

    return best_local_size
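# A hypothetical check of ``find_local_size``: for a global size of (13, 15) and
# a flat local size of 8, the decomposition (1, 8) yields the bounding grid
# (13, 16), i.e. 208 threads for 195 work items, the smallest overhead among
# the two-factor decompositions of 8 (none of which gets under the 5% threshold),
# so it should be returned:
#
#     assert find_local_size((13, 15), 8) == (1, 8)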
def _add_transpose(self, plan, device_params,
        mem_out, mem_in, batch_shape, height_shape, width_shape):

    bso = self._block_width_override
    block_width = device_params.local_mem_banks if bso is None else bso

    if block_width ** 2 > device_params.max_work_group_size:
        # If it is not CPU, current solution may affect performance
        block_width = int(numpy.sqrt(device_params.max_work_group_size))

    input_height = helpers.product(height_shape)
    input_width = helpers.product(width_shape)
    batch = helpers.product(batch_shape)

    blocks_per_matrix = helpers.min_blocks(input_height, block_width)
    grid_width = helpers.min_blocks(input_width, block_width)

    render_kwds = dict(
        input_width=input_width, input_height=input_height, batch=batch,
        block_width=block_width,
        grid_width=grid_width,
        blocks_per_matrix=blocks_per_matrix,
        input_slices=[len(batch_shape), len(height_shape), len(width_shape)],
        output_slices=[len(batch_shape), len(width_shape), len(height_shape)])

    plan.kernel_call(
        TEMPLATE.get_def('transpose'), [mem_out, mem_in],
        kernel_name="kernel_transpose",
        global_size=(batch, blocks_per_matrix * block_width, grid_width * block_width),
        local_size=(1, block_width, block_width),
        render_kwds=render_kwds)
def group_dimensions(virtual_shape, available_shape):
    """
    Returns two lists of tuples: one with the indices of grouped virtual dimensions,
    the other with the indices of the corresponding groups of available dimensions,
    such that for any group of virtual dimensions, the total number of elements
    they cover does not exceed the number of elements covered by the corresponding
    group of available dimensions.
    """
    assert product(virtual_shape) <= product(available_shape)
    return _group_dimensions(0, virtual_shape, 0, available_shape)
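# For illustration (assuming the ``_group_dimensions`` implementation shown
# further below): a virtual dimension that does not fit into any single
# available dimension is assigned a group of several available dimensions, e.g.
#
#     group_dimensions((1024,), (256, 8))  # should yield ([(0,)], [(0, 1)])
#
# since 1024 > 256, but 1024 <= 256 * 8.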
def __init__(self, dtype, shape=None, strides=None):
    self.shape = tuple() if shape is None else wrap_in_tuple(shape)
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    if strides is None:
        self.strides = tuple(
            self.dtype.itemsize * product(self.shape[i+1:])
            for i in range(len(self.shape)))
    else:
        self.strides = strides

    self._cast = dtypes.cast(self.dtype)
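# The default strides above describe a C-contiguous layout: each dimension's
# stride is the itemsize times the number of elements in the trailing
# dimensions, matching numpy's convention. A quick sanity check:

import numpy
assert numpy.zeros((2, 3, 4), numpy.float32).strides == (48, 16, 4)
# 48 == 4 (itemsize) * 3 * 4, 16 == 4 * 4, 4 == 4 * 1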
def _build_plan(self, plan_factory, device_params, output, matrix_a, matrix_b):
    bwo = self._block_width_override

    if bwo is not None:
        block_widths = [bwo]
    else:
        nbanks = device_params.local_mem_banks
        block_widths = [2 ** n for n in range(helpers.log2(nbanks), -1, -1)]

    a_batch = helpers.product(matrix_a.shape[:-2])
    b_batch = helpers.product(matrix_b.shape[:-2])
    batch = max(a_batch, b_batch)

    for block_width in block_widths:

        plan = plan_factory()

        if block_width ** 2 > device_params.max_work_group_size:
            continue

        num_steps = helpers.min_blocks(self._convolution_size, block_width)
        a_blocks = helpers.min_blocks(self._a_outer_size, block_width)
        b_blocks = helpers.min_blocks(self._b_outer_size, block_width)

        render_kwds = dict(
            batched_a=(a_batch != 1),
            batched_b=(b_batch != 1),
            transposed_a=self._transposed_a,
            transposed_b=self._transposed_b,
            num_steps=num_steps,
            a_slices=(len(matrix_a.shape) - 2, 1, 1),
            b_slices=(len(matrix_b.shape) - 2, 1, 1),
            output_slices=(len(output.shape) - 2, 1, 1),
            block_width=block_width,
            mul=functions.mul(matrix_a.dtype, matrix_b.dtype, out_dtype=output.dtype))

        try:
            plan.kernel_call(
                TEMPLATE.get_def('matrixmul'),
                [output, matrix_a, matrix_b],
                kernel_name="kernel_matrixmul",
                global_size=(batch, a_blocks * block_width, b_blocks * block_width),
                local_size=(1, block_width, block_width),
                render_kwds=render_kwds)
        except OutOfResourcesError:
            continue

        return plan

    raise ValueError("Could not find suitable call parameters for the kernel")
def compatible_with(self, other):
    if self.dtype != other.dtype:
        return False

    common_shape_len = min(len(self.shape), len(other.shape))
    if self.shape[-common_shape_len:] != other.shape[-common_shape_len:]:
        return False
    if self.strides[-common_shape_len:] != other.strides[-common_shape_len:]:
        return False
    if helpers.product(self.shape[:-common_shape_len]) != 1:
        return False
    if helpers.product(other.shape[:-common_shape_len]) != 1:
        return False

    return True
def test_find_local_size(global_size, flat_local_size, expected_local_size):
    """
    Checking that ``find_local_size`` finds the sizes we expect from it.
    """
    local_size = vsize.find_local_size(global_size, flat_local_size)
    assert product(local_size) == flat_local_size
    assert local_size == expected_local_size
def __init__(self, dtype, device_params, outer_shape, fft_size,
        fft_size_real, inner_shape, reverse_direction):

    self.name = "fft_local"
    self.inplace_possible = True
    self.output_shape = outer_shape + (fft_size_real if reverse_direction else fft_size,)

    if fft_size_real != fft_size and reverse_direction:
        self.kweights = get_kweights(fft_size_real, fft_size)
    else:
        self.kweights = None

    self._fft_size = fft_size
    self._fft_size_real = fft_size_real
    self._outer_batch = helpers.product(outer_shape)
    self._local_mem_size = device_params.local_mem_size
    self._itemsize = dtype.itemsize

    self._constant_kwds = get_common_kwds(dtype, device_params)
    self._constant_kwds.update(dict(
        takes_kweights=(self.kweights is not None),
        input_slices=(len(outer_shape), 1, len(inner_shape)),
        output_slices=(len(outer_shape), 1, len(inner_shape)),
        pad_in=(fft_size != fft_size_real and not reverse_direction),
        unpad_out=(fft_size != fft_size_real and reverse_direction),
        reverse_direction=reverse_direction,
        normalize=True))
def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
    self.shape = tuple() if shape is None else wrap_in_tuple(shape)
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    default_strides = helpers.default_strides(self.shape, self.dtype.itemsize)
    if strides is None:
        strides = default_strides
    else:
        strides = tuple(strides)
    self._default_strides = strides == default_strides
    self.strides = strides

    default_nbytes = helpers.min_buffer_size(self.shape, self.dtype.itemsize, self.strides)
    if nbytes is None:
        nbytes = default_nbytes
    self._default_nbytes = nbytes == default_nbytes
    self.nbytes = nbytes

    self.offset = offset
    self._cast = dtypes.cast(self.dtype)
def find_bounding_shape(virtual_size, available_shape):
    """
    Finds a tuple of the same length as ``available_shape``,
    with every element not greater than the corresponding element of ``available_shape``,
    and the product not lower than ``virtual_size``.
    """
    assert virtual_size <= product(available_shape)

    free_size = virtual_size
    free_dims = set(range(len(available_shape)))
    bounding_shape = [None] * len(available_shape)

    while len(free_dims) > 0:
        guess = ceiling_root(free_size, len(free_dims))
        for fdim in free_dims:
            bounding_shape[fdim] = guess

        for fdim in free_dims:
            if bounding_shape[fdim] > available_shape[fdim]:
                bounding_shape[fdim] = available_shape[fdim]
                free_dims.remove(fdim)
                free_size = min_blocks(free_size, bounding_shape[fdim])
                break
        else:
            return tuple(bounding_shape)

    return tuple(available_shape)
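# Worked examples of the contract, assuming ``ceiling_root(x, n)`` returns the
# smallest integer r such that r ** n >= x:
#
#     find_bounding_shape(100, (16, 16))  # -> (10, 10), since ceiling_root(100, 2) == 10
#     find_bounding_shape(100, (4, 64))   # -> (4, 25): dimension 0 is clamped to 4,
#                                         #    leaving min_blocks(100, 4) == 25 elements
#                                         #    for the remaining dimension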
def _build_plan(self, plan_factory, device_params, lwe_a, lwe_b, accum_a, gsw, bara):

    params = self._params
    tlwe_params = params.tlwe_params
    decomp_length = params.decomp_length
    mask_size = tlwe_params.mask_size
    perf_params = self._perf_params

    transform_type = tlwe_params.transform_type
    transform = get_transform(transform_type)
    transform_module = transform.transform_module(perf_params, multi_iter=True)

    batch_shape = accum_a.shape[:-2]

    min_local_size = decomp_length * (mask_size + 1) * transform_module.threads_per_transform
    local_size = device_params.max_work_group_size

    while local_size >= min_local_size:

        plan = plan_factory()

        if transform_module.use_constant_memory:
            cdata_forward = plan.constant_array(transform_module.cdata_fw)
            cdata_inverse = plan.constant_array(transform_module.cdata_inv)
        else:
            cdata_forward = plan.persistent_array(transform_module.cdata_fw)
            cdata_inverse = plan.persistent_array(transform_module.cdata_inv)

        try:
            plan.kernel_call(
                TEMPLATE.get_def("BlindRotate"),
                [lwe_a, lwe_b, accum_a, gsw, bara, cdata_forward, cdata_inverse],
                global_size=(helpers.product(batch_shape), local_size),
                local_size=(1, local_size),
                render_kwds=dict(
                    local_size=local_size,
                    slices=(len(batch_shape), 1, 1),
                    slices2=(len(batch_shape), 1),
                    slices3=(len(batch_shape),),
                    transform=transform_module,
                    mask_size=mask_size,
                    decomp_length=decomp_length,
                    output_size=self._in_out_params.size,
                    input_size=tlwe_params.extracted_lweparams.size,
                    bs_log2_base=self._params.bs_log2_base,
                    mul=transform.transformed_mul(perf_params),
                    add=transform.transformed_add(perf_params),
                    tr_ctype=transform.transformed_internal_ctype(),
                    min_blocks=helpers.min_blocks))
        except OutOfResourcesError:
            local_size -= transform_module.threads_per_transform
            continue

        return plan

    raise ValueError("Could not find suitable local size for the kernel")
def _build_plan(self, plan_factory, device_params, output, input_):
    plan = plan_factory()

    batch_size = helpers.product(output.shape[:-1])
    blocks_num = helpers.min_blocks(batch_size, self._transforms_per_block)

    cdata_arr = self._transform.cdata_inv if self._inverse else self._transform.cdata_fw
    if self._transform.use_constant_memory:
        cdata = plan.constant_array(cdata_arr)
    else:
        cdata = plan.persistent_array(cdata_arr)

    plan.kernel_call(
        TEMPLATE.get_def('standalone_transform'),
        [output, input_, cdata],
        global_size=(
            blocks_num,
            self._transform.threads_per_transform * self._transforms_per_block),
        local_size=(
            1,
            self._transform.threads_per_transform * self._transforms_per_block),
        render_kwds=dict(
            inverse=self._inverse,
            i32_conversion=self._i32_conversion,
            kernel_repetitions=self._kernel_repetitions,
            transform=self._transform,
            transforms_per_block=self._transforms_per_block,
            batch_size=batch_size,
            blocks_num=blocks_num,
            slices=(len(output.shape) - 1, 1)))

    return plan
def _build_plan(self, plan_factory, device_params, output, input_, inverse):

    if helpers.product([input_.shape[i] for i in self._axes]) == 1:
        return self._build_trivial_plan(plan_factory, output, input_)

    plan = plan_factory()

    axes = tuple(sorted(self._axes))
    shape = list(input_.shape)

    if all(shape[axis] % 2 == 0 for axis in axes):
        # If all shift axes have even length, it is possible to perform the shift inplace
        # (by swapping pairs of elements).
        # Note that the inplace fftshift is its own inverse.
        shape[axes[0]] //= 2
        plan.kernel_call(
            TEMPLATE.get_def('fftshift_inplace'),
            [output, input_],
            kernel_name="kernel_fftshift_inplace",
            global_size=shape,
            render_kwds=dict(axes=axes))
    else:
        # Resort to an out-of-place shift to a temporary array and then copy.
        temp = plan.temp_array_like(output)
        plan.kernel_call(
            TEMPLATE.get_def('fftshift_outplace'),
            [temp, input_, inverse],
            kernel_name="kernel_fftshift_outplace",
            global_size=shape,
            render_kwds=dict(axes=axes))
        copy_trf = copy(input_, out_arr_t=output)
        copy_comp = PureParallel.from_trf(copy_trf, copy_trf.input)
        plan.computation_call(copy_comp, output, temp)

    return plan
def check_performance(thr_and_double, shape_and_axes, fast_math):
    thr, double = thr_and_double

    shape, axes = shape_and_axes
    dtype = numpy.complex128 if double else numpy.complex64

    data = get_test_array(shape, dtype)
    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data_dev)

    fft = FFT(data_dev, axes=axes)
    fftc = fft.compile(thr, fast_math=fast_math)

    attempts = 10
    t1 = time.time()
    for i in range(attempts):
        fftc(res_dev, data_dev)
    thr.synchronize()
    t2 = time.time()
    dev_time = (t2 - t1) / attempts

    fwd_ref = numpy.fft.fftn(data, axes=axes).astype(dtype)
    assert diff_is_negligible(res_dev.get(), fwd_ref)

    return dev_time, product(shape) * sum(numpy.log2(shape[a]) for a in axes) * 5
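# The returned pair is (time, estimated floating-point operations), using the
# conventional 5 * N * log2(N) cost estimate of a complex FFT, summed over the
# transformed axes. For example, a (512, 512) transform over both axes is
# estimated as 512 * 512 * (9 + 9) * 5 == 23592960 operations, so the caller
# can convert the measured time directly into FLOPS.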
def _build_plan(self, plan_factory, device_params, output, input_, inverse):

    if helpers.product([input_.shape[i] for i in self._axes]) == 1:
        return self._build_trivial_plan(plan_factory, output, input_)

    # While the resource consumption of a GlobalFFTKernel can be lowered by passing
    # a smaller value to prepare_for(), a LocalFFTKernel may have to be split
    # into several kernels instead.
    # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
    # we just call prepare_for() with a lower limit, but if LocalFFTKernel.prepare_for()
    # does that, we have to recreate the whole chain.
    local_kernel_limit = device_params.max_work_group_size

    while local_kernel_limit >= 1:
        try:
            plan = self._build_limited_plan(
                plan_factory, device_params, local_kernel_limit, output, input_, inverse)
        except LocalKernelFail:
            # One of the LocalFFTKernels was out of resources.
            # Reduce the limit and try to create the operations from scratch again.
            local_kernel_limit //= 2
            continue
        except GlobalKernelFail:
            raise ValueError(
                "Could not find suitable call parameters for one of the global kernels")

        return plan

    raise ValueError("Could not find suitable call parameters for one of the local kernels")
def get_fft_1d_kernels(dtype, device_params, outer_shape, fft_size, inner_shape,
        local_kernel_limit, reverse_direction=False, fft_size_real=None):
    """Create and compile kernels for one of the dimensions."""

    kernels = []

    if fft_size_real is None:
        fft_size_real = fft_size

    if helpers.product(inner_shape) == 1 and fft_size // MAX_RADIX <= local_kernel_limit:
        kernels.append(LocalFFTKernel(
            dtype, device_params, outer_shape, fft_size,
            fft_size_real, inner_shape, reverse_direction))
    else:
        kernels.extend(GlobalFFTKernel.create_chain(
            dtype, device_params, outer_shape, fft_size,
            fft_size_real, inner_shape, reverse_direction))

    return kernels
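# The branch above uses a single LocalFFTKernel only when the transformed axis
# is the innermost one (no inner batch) and the whole transform fits into one
# work group. Assuming, for illustration, MAX_RADIX == 16: an fft_size of 1024
# needs 1024 // 16 == 64 threads, well within typical work group limits, while
# much larger sizes fall back to a GlobalFFTKernel chain.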
def __call__(self, *xs):
    """
    Evaluates the function in coordinate space on the given grid.
    """
    if len(xs) > 1:
        xxs = numpy.meshgrid(*xs, indexing="ij")
    else:
        xxs = xs

    res_shape = ((self.batch,) if self.batch is not None else tuple()) + xxs[0].shape
    res = numpy.zeros(res_shape, self.dtype)

    for coeff, coord in self.modes:
        if self.batch is not None:
            b = coord[0]
            coord = coord[1:]
            target = res[b]
        else:
            target = res
        target += coeff * product([self.harmonics[m](xx) for m, xx in zip(coord, xxs)])

    return res ** self.order
def pytest_generate_tests(metafunc):

    errors_shapes_and_axes = [
        ((10,), (0,)),
        ((11,), (0,)),
        ((9000,), (0,)),
        ((9001,), (0,)),
        ((128, 60), (0, 1)),
        ((127, 60), (0, 1)),
        ((127, 61), (0, 1)),
        ((100, 80, 60), (0, 1, 2)),
        ((101, 80, 61), (0, 1, 2)),
        ((101, 80, 61), (0, 2)),
        ((20, 31, 80, 61), (0, 2)),
        ]

    perf_shapes = [
        (2**4,), # 1D, small size
        (2**18,), # 1D, large size
        (2**4, 2**4), # 2D, small size
        (2**9, 2**9), # 2D, large size
        ]

    perf_even_shapes_and_axes = []
    perf_odd_shapes_and_axes = []
    mem_limit = 2**22
    for contiguous in (True, False):
        for shape in perf_shapes:
            batch = mem_limit // product(shape)
            if contiguous:
                full_shape = (batch,) + shape
                axes = tuple(range(1, len(shape) + 1))
            else:
                full_shape = shape + (batch,)
                axes = tuple(range(0, len(shape)))
            perf_even_shapes_and_axes.append((full_shape, axes))

            full_shape = list(full_shape)
            for axis in axes:
                full_shape[axis] -= 1
            perf_odd_shapes_and_axes.append((tuple(full_shape), axes))

    idgen = lambda pair: str(pair[0]) + '_over_' + str(pair[1])

    if 'errors_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize(
            'errors_shape_and_axes', errors_shapes_and_axes,
            ids=list(map(idgen, errors_shapes_and_axes)))
    elif 'perf_even_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize(
            'perf_even_shape_and_axes', perf_even_shapes_and_axes,
            ids=list(map(idgen, perf_even_shapes_and_axes)))
    elif 'perf_odd_shape_and_axes' in metafunc.funcargnames:
        metafunc.parametrize(
            'perf_odd_shape_and_axes', perf_odd_shapes_and_axes,
            ids=list(map(idgen, perf_odd_shapes_and_axes)))
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Large problem sizes.
    """
    dtype = dtypes.normalize_type(numpy.int64)
    min_time = check_scan(
        thr, large_perf_shape, dtype=dtype, axes=None,
        exclusive=exclusive, measure_time=True)
    return min_time, helpers.product(large_perf_shape) * dtype.itemsize
def test_find_bounding_shape(virtual_size, available_shape):
    """
    Tests that ``find_bounding_shape()`` obeys its contracts.
    """
    shape = vsize.find_bounding_shape(virtual_size, available_shape)
    assert all(isinstance(d, int) for d in shape)
    assert product(shape) >= virtual_size
    assert all(d <= ad for d, ad in zip(shape, available_shape))
def _build_plan(self, plan_factory, device_params, output, input_):
    plan = plan_factory()

    N = input_.shape[-1] * 2
    batch_shape = input_.shape[:-1]
    batch_size = helpers.product(batch_shape)

    coeffs1 = 4 * numpy.sin(2 * numpy.pi * numpy.arange(N // 2) / N)
    coeffs2 = 2 * numpy.cos(2 * numpy.pi * numpy.arange(N // 2) / N)

    c1_arr = plan.persistent_array(coeffs1)
    c2_arr = plan.persistent_array(coeffs2)

    multiply = get_multiply(input_)

    # re_X_0 = sum(x * coeffs2)
    t = plan.temp_array_like(input_)
    rd = Reduce(t, predicate_sum(input_.dtype), axes=(len(input_.shape) - 1,))
    rd.parameter.input.connect(multiply, multiply.output, x=multiply.a, c2=multiply.b)
    re_X_0 = plan.temp_array_like(rd.parameter.output)
    plan.computation_call(rd, re_X_0, input_, c2_arr)

    # Y = numpy.fft.rfft(x * coeffs1)
    rfft = RFFT(input_, dont_store_last=True)
    rfft.parameter.input.connect(multiply, multiply.output, x=multiply.a, c1=multiply.b)
    Y = plan.temp_array_like(rfft.parameter.output)
    plan.computation_call(rfft, Y, input_, c1_arr)

    # Y *= -1j
    # Y[0] /= 2
    # Y[0] += re_X_0
    # res = numpy.cumsum(Y[:-1])
    prepare_prfft_scan = get_prepare_prfft_scan(Y)
    sc = Scan(Y, predicate_sum(Y.dtype), axes=(-1,), exclusive=False)
    sc.parameter.input.connect(
        prepare_prfft_scan, prepare_prfft_scan.output,
        Y=prepare_prfft_scan.Y, re_X_0=prepare_prfft_scan.re_X_0)
    plan.computation_call(sc, output, Y, re_X_0)

    return plan
def _build_plan(
        self, plan_factory, device_params,
        result_a, result_cv, key, noises1, noises2):

    plan = plan_factory()

    polynomial_degree = self._polynomial_degree
    batch_shape = result_a.shape[:-2]
    batch_len = helpers.product(batch_shape)

    perf_params = self._perf_params
    transform = get_transform(self._transform_type)

    ft_key = transform.ForwardTransform(key.shape[:-1], polynomial_degree, perf_params)
    key_tr = plan.temp_array_like(ft_key.parameter.output)

    ft_noises = transform.ForwardTransform(noises1.shape[:-1], polynomial_degree, perf_params)
    noises1_tr = plan.temp_array_like(ft_noises.parameter.output)

    ift = transform.InverseTransform(noises1.shape[:-1], polynomial_degree, perf_params)
    ift_res = plan.temp_array_like(ift.parameter.output)

    mul_tr = Transformation(
        [
            Parameter('output', Annotation(ift.parameter.input, 'o')),
            Parameter('key', Annotation(key_tr, 'i')),
            Parameter('noises1', Annotation(noises1_tr, 'i'))
        ],
        """
        ${output.store_same}(${tr_ctype}unpack(${mul}(
            ${tr_ctype}pack(${key.load_idx}(${idxs[-2]}, ${idxs[-1]})),
            ${tr_ctype}pack(${noises1.load_same})
            )));
        """,
        connectors=['output', 'noises1'],
        render_kwds=dict(
            mul=transform.transformed_mul(perf_params),
            tr_ctype=transform.transformed_internal_ctype()))

    ift.parameter.input.connect(
        mul_tr, mul_tr.output, key=mul_tr.key, noises1=mul_tr.noises1)

    plan.computation_call(ft_key, key_tr, key)
    plan.computation_call(ft_noises, noises1_tr, noises1)
    plan.computation_call(ift, ift_res, key_tr, noises1_tr)

    plan.kernel_call(
        TEMPLATE.get_def("tlwe_encrypt_zero_fill_result"),
        [result_a, result_cv, noises1, noises2, ift_res],
        kernel_name="tlwe_encrypt_zero_fill_result",
        global_size=(batch_len, self._mask_size + 1, polynomial_degree),
        render_kwds=dict(
            noise=self._noise,
            mask_size=self._mask_size,
            noises1_slices=(len(batch_shape), 1, 1),
            noises2_slices=(len(batch_shape), 1),
            cv_slices=(len(batch_shape),)))

    return plan
def _build_plan(self, plan_factory, _device_params, counters, randoms):
    plan = plan_factory()
    plan.kernel_call(
        TEMPLATE.get_def('cbrng'),
        [counters, randoms],
        global_size=helpers.product(counters.shape),
        render_kwds=dict(
            sampler=self._sampler,
            keygen=self._keygen,
            batch=helpers.product(randoms.shape[:-self._generators_dim]),
            counters_slices=[self._generators_dim],
            randoms_slices=[
                len(randoms.shape) - self._generators_dim,
                self._generators_dim]))
    return plan
def __init__(
        self, thr, template_src, name, global_size,
        local_size=None, render_args=None, render_kwds=None,
        fast_math=False, compiler_options=None, constant_arrays=None, keep=False):
    """__init__()""" # hide the signature from Sphinx

    self._thr = thr

    if render_args is None:
        render_args = []
    if render_kwds is None:
        render_kwds = {}
    main_src = render_template_source(
        template_src, render_args=render_args, render_kwds=render_kwds)

    # Since the virtual size functions require some registers,
    # they affect the maximum local size.
    # Start from the device's max work group size as the first approximation
    # and recompile kernels with smaller local sizes until convergence.
    max_local_size = thr.device_params.max_work_group_size

    while True:
        # Try to find kernel launch parameters for the requested local size.
        # May raise OutOfResourcesError if it is not possible;
        # just let it pass to the caller.
        vs = VirtualSizes(
            thr.device_params, global_size,
            virtual_local_size=local_size, max_local_size=max_local_size)

        # Try to compile the kernel with the corresponding virtual size functions
        program = Program(
            self._thr, vs.vsize_functions + main_src,
            static=True, fast_math=fast_math,
            compiler_options=compiler_options, constant_arrays=constant_arrays,
            keep=keep)
        kernel = getattr(program, name)

        if kernel.max_work_group_size >= product(vs.real_local_size):
            # The kernel will execute with this local size; use it.
            break

        # By the contract of VirtualSizes,
        # product(vs.real_local_size) <= max_local_size.
        # Also, since we are still in this loop,
        # kernel.max_work_group_size < product(vs.real_local_size).
        # Therefore the new max_local_size value is guaranteed
        # to be smaller than the previous one.
        max_local_size = kernel.max_work_group_size

    self._program = program
    self._kernel = kernel

    self.virtual_local_size = vs.virtual_local_size
    self.virtual_global_size = vs.virtual_global_size
    self.local_size = vs.real_local_size
    self.global_size = vs.real_global_size

    self._kernel.prepare(self.global_size, local_size=self.local_size)
def generate_modes(mshape, dtype, batch=None, random=True):
    """
    Generates a list of sparse modes for the problem of the given shape.
    """
    max_modes_per_batch = 20
    modelist = []

    if product(mshape) <= max_modes_per_batch:
        # If there are not many modes, fill all of them
        # (materialized as a list, since it is reused for every batch item)
        modenums = list(itertools.product(*[range(modes) for modes in mshape]))
        if batch is not None:
            for b in range(batch):
                modelist += [((b,) + modenum) for modenum in modenums]
        else:
            modelist += modenums
    else:
        # If there are many modes, fill some random ones
        rand_coord = lambda: tuple(
            numpy.random.randint(0, mshape[i]) for i in range(len(mshape)))

        if batch is not None:
            for b in range(batch):
                for i in range(max_modes_per_batch):
                    modelist.append((b,) + rand_coord())
        else:
            for i in range(max_modes_per_batch):
                modelist.append(rand_coord())

        # Add corner modes, to make sure extreme cases are still processed correctly
        corner_modes = itertools.product(*[(0, mshape[i] - 1) for i in range(len(mshape))])
        for modenum in corner_modes:
            if batch is not None:
                for b in range(batch):
                    modelist.append((b,) + modenum)
            else:
                modelist.append(modenum)

    modelist = set(modelist) # remove duplicates

    # Assign coefficients
    modes = []
    for coord in modelist:
        get_coeff = lambda: numpy.random.normal() if random else 1
        if dtypes.is_complex(dtype):
            coeff = get_coeff() + 1j * get_coeff()
        else:
            coeff = get_coeff()
        coeff = dtype(coeff)

        # Scale down the coefficients of higher modes
        # because of their lower precision
        modenums = coord if batch is None else coord[1:]
        coeff /= sum(modenums) + 1

        modes.append((coeff, coord))

    return modes
def __init__(self, shape, box, drift, trajectories=1, diffusion=None):

    if diffusion is not None:
        assert diffusion.dtype == drift.dtype
        assert diffusion.components == drift.components

        if not diffusion.real_noise or dtypes.is_real(drift.dtype):
            noise_dtype = drift.dtype
        else:
            noise_dtype = dtypes.real_for(drift.dtype)

        self.noise_type = Type(
            noise_dtype, (trajectories, diffusion.noise_sources) + shape)
        self.noise = True

        cell_volume = product(box) / product(shape)
        self._noise_normalization = 1. / cell_volume
    else:
        self.noise_type = None
        self.noise = False
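# ``cell_volume`` is the volume of a single grid cell. For example, a box of
# size (10.0,) sampled on a (64,) grid gives cell_volume == 10.0 / 64 == 0.15625
# and a noise normalization of 1 / 0.15625 == 6.4.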
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Small problem sizes, big batches.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)
    min_time = check_scan(
        thr, shape, dtype=dtype, axes=(-1,),
        exclusive=exclusive, measure_time=True, seq_size=seq_size)
    return min_time, helpers.product(shape) * dtype.itemsize
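# Both scan benchmarks return a (min_time, bytes_processed) pair, so the
# effective bandwidth follows directly. With hypothetical numbers:

min_time = 2e-3                            # seconds, measured
nbytes = 500 * 2 * 2 * 512 * 16            # complex128 itemsize is 16 bytes
bandwidth_gbs = nbytes / min_time / 1e9    # ~8.2 GB/s for these values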
def __init__(self, virtual_shape, available_shape):
    self.real_dims = {}
    self.real_strides = {}
    self.virtual_strides = {}
    self.major_vdims = {}
    self.bounding_shape = tuple()
    self.skip_thresholds = []

    v_groups, a_groups = group_dimensions(virtual_shape, available_shape)

    for v_group, a_group in zip(v_groups, a_groups):
        virtual_subshape = virtual_shape[v_group[0]:v_group[-1]+1]
        virtual_subsize = product(virtual_subshape)

        bounding_subshape = find_bounding_shape(
            virtual_subsize, available_shape[a_group[0]:a_group[-1]+1])

        self.bounding_shape += bounding_subshape

        if virtual_subsize < product(bounding_subshape):
            strides = [
                (adim, product(bounding_subshape[:i]))
                for i, adim in enumerate(a_group)]
            self.skip_thresholds.append((virtual_subsize, strides))

        for vdim in v_group:
            self.real_dims[vdim] = a_group
            self.real_strides[vdim] = tuple(
                product(self.bounding_shape[a_group[0]:adim]) for adim in a_group)
            self.virtual_strides[vdim] = product(virtual_shape[v_group[0]:vdim])

            # The major virtual dimension (the one that does not require
            # a modulus operation when extracting its index from the flat index)
            # is the last non-trivial one (not of size 1).
            # The modulus will not be optimized away by the compiler,
            # but we know that all threads outside of the virtual group will be
            # filtered out by VIRTUAL_SKIP_THREADS.
            for major_vdim in range(len(v_group) - 1, -1, -1):
                if virtual_shape[v_group[major_vdim]] > 1:
                    break

            self.major_vdims[vdim] = v_group[major_vdim]
def _group_dimensions(vdim, virtual_shape, adim, available_shape):
    """
    ``vdim`` and ``adim`` are used for the absolute addressing
    of dimensions during recursive calls.
    """
    if len(virtual_shape) == 0:
        return [], []

    vdim_group = 1 # number of currently grouped virtual dimensions
    adim_group = 1 # number of currently grouped available dimensions

    while 1:
        # If we have more elements in the virtual group than in the available group,
        # extend the available group by one dimension.
        if product(virtual_shape[:vdim_group]) > product(available_shape[:adim_group]):
            adim_group += 1
            continue

        # If the remaining available dimensions cannot accommodate
        # the remaining virtual dimensions,
        # try to fit one more virtual dimension into the virtual group.
        if product(virtual_shape[vdim_group:]) > product(available_shape[adim_group:]):
            vdim_group += 1
            continue

        # If we are here, it means that:
        # 1) the current available group can accommodate the current virtual group;
        # 2) the remaining available dimensions can accommodate
        #    the remaining virtual dimensions.
        # This means we can make a recursive call now.

        # Attach any following trivial virtual dimensions (of size 1) to this group.
        # This helps to avoid unassigned trivial dimensions
        # when no real dimensions are left.
        while vdim_group < len(virtual_shape) and virtual_shape[vdim_group] == 1:
            vdim_group += 1

        v_res = tuple(range(vdim, vdim + vdim_group))
        a_res = tuple(range(adim, adim + adim_group))

        v_remainder, a_remainder = _group_dimensions(
            vdim + vdim_group, virtual_shape[vdim_group:],
            adim + adim_group, available_shape[adim_group:])
        return [v_res] + v_remainder, [a_res] + a_remainder
def try_create(cls, global_size, local_size, max_num_groups, max_work_item_sizes):
    """
    This method is used to filter working combinations of parameters
    from the cartesian product of all the possible ones.
    Returns ``None`` if the parameters are not compatible.
    """
    if len(max_num_groups) != len(max_work_item_sizes):
        return None

    if local_size is not None:
        if len(local_size) > len(global_size):
            return None

        # We need the local size and the global size to have the same length
        local_size = local_size + (1,) * (len(global_size) - len(local_size))

        if product(local_size) > product(max_work_item_sizes):
            return None

        bounding_global_size = [
            ls * min_blocks(gs, ls) for gs, ls in zip(global_size, local_size)]

        if product(bounding_global_size) > product(max_num_groups):
            return None
    else:
        if product(global_size) > product(max_num_groups):
            return None

    return cls(global_size, local_size, max_num_groups, max_work_item_sizes)
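# Illustration of the bounding size check above: for global_size == (1000,) and
# local_size == (128,), the bounding global size is
# 128 * min_blocks(1000, 128) == 128 * 8 == 1024 threads, and the combination
# is rejected if that exceeds product(max_num_groups).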