def _construct_operations(self, basis, device_params):
    """Build the operation recorder holding the chain of FFT kernel calls.

    :param basis: object whose ``shape``, ``axes`` and ``dtype`` attributes
        are read here; it is also forwarded to ``get_fft_kernels()``.
    :param device_params: object providing ``max_work_group_size``, used as
        the starting limit both for the whole chain and for every kernel.
    :returns: the recorder obtained from ``self._get_operation_recorder()``,
        filled either with a single identity computation (trivial problem)
        or with one kernel call per FFT kernel.
    :raises ValueError: if no work group size down to 1 is accepted by a
        global kernel, or if no chain-wide limit down to 1 produces a chain
        whose local kernels all fit.
    """
    if product([basis.shape[i] for i in basis.axes]) == 1:
        # Trivial problem. Need to add a dummy kernel
        # because we still have to run transformations.
        operations = self._get_operation_recorder()
        identity = self.get_nested_computation(
            specialize_elementwise(
                'output', 'input', 'direction',
                dict(kernel="${output.store}(idx, ${input.load}(idx));")))
        operations.add_computation(identity, 'output', 'input', 'direction')
        return operations

    # While resource consumption of GlobalFFTKernel can be made lower by passing
    # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
    # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
    # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
    # does that, we have to recreate the whole chain.
    local_kernel_limit = device_params.max_work_group_size

    while local_kernel_limit >= 1:
        # Starting from scratch.
        operations = self._get_operation_recorder()
        kernels = get_fft_kernels(basis, device_params, local_kernel_limit)

        for i, kernel in enumerate(kernels):
            # Each kernel reads what the previous one wrote; the first reads
            # 'input' and the last writes 'output'.
            mem_in = 'input' if i == 0 else mem_out
            if i == len(kernels) - 1:
                mem_out = 'output'
            else:
                mem_out = operations.add_allocation(kernel.output_shape, basis.dtype)

            if kernel.kweights is not None:
                kweights = operations.add_const_allocation(
                    kernel.kweights.astype(basis.dtype))
                kweights_arg = [kweights]
            else:
                kweights_arg = []

            argnames = [mem_out, mem_in] + kweights_arg + ['direction']

            # Try to find local size for each of the kernels
            local_size = device_params.max_work_group_size
            # marks the event when LocalFFTKernel is out of resources
            local_kernel_fail = False
            while local_size >= 1 and not local_kernel_fail:
                try:
                    gs, ls, kwds = kernel.prepare_for(local_size)
                    operations.add_kernel(
                        TEMPLATE, kernel.name, argnames,
                        global_size=gs, local_size=ls, render_kwds=kwds,
                        inplace=([(mem_out, mem_in)] if kernel.inplace_possible else None))
                except OutOfResourcesError:
                    if isinstance(kernel, GlobalFFTKernel):
                        local_size //= 2
                    else:
                        local_kernel_fail = True
                    continue

                # The kernel was added successfully.
                break
            else:
                # The loop ended without 'break': either the local size
                # dropped below 1 (global kernel) or a local kernel failed.
                if not local_kernel_fail:
                    raise ValueError(
                        "Could not find suitable call parameters for one of the global kernels")

            if local_kernel_fail:
                break
        else:
            # everything went well, returning the recorded operations
            return operations

        # The cycle above received 'break', meaning that LocalFFTKernel was out of resources.
        # Reduce the limit and try to create operations from scratch again.
        local_kernel_limit //= 2
    else:
        raise ValueError("Could not find suitable call parameters for one of the local kernels")
import tigger.transformations as tr
from helpers import *


def pytest_generate_tests(metafunc):
    """Parametrize every test requesting ``any_dtype`` over a set of dtypes.

    The set consists of two integer dtypes, one float dtype and one complex
    dtype; the test ids are the string forms of the dtypes.
    """
    int_dtypes = [numpy.dtype('int32'), numpy.dtype('int64')]
    float_dtypes = [numpy.dtype('float32')]
    complex_dtypes = [numpy.dtype('complex64')]

    # ``funcargnames`` is a deprecated alias of ``fixturenames`` that newer
    # pytest releases no longer provide; fall back to it only when the
    # modern attribute is missing.
    requested = getattr(metafunc, 'fixturenames', None)
    if requested is None:
        requested = metafunc.funcargnames

    if 'any_dtype' in requested:
        dtypes = int_dtypes + float_dtypes + complex_dtypes
        metafunc.parametrize('any_dtype', dtypes, ids=[str(x) for x in dtypes])


# Elementwise computation whose kernel stores every loaded input element
# into the output at the same index.
TestComputation = specialize_elementwise(
    'output', 'input', None,
    dict(kernel="${output.store}(idx, ${input.load}(idx));"))


def test_identity(some_ctx, any_dtype):
    """Check that identity transformations on both ends keep the data intact."""
    host_data = get_test_array((1000,), any_dtype)
    input_dev = some_ctx.to_device(host_data)
    output_dev = some_ctx.empty_like(input_dev)

    test = TestComputation(some_ctx)
    test.connect(tr.identity(), 'input', ['input_prime'])
    test.connect(tr.identity(), 'output', ['output_prime'])
    test.prepare_for(output_dev, input_dev)
    test(output_dev, input_dev)

    assert diff_is_negligible(output_dev.get(), host_data)
# ``tr`` is used by test_identity below, so import it explicitly.
import tigger.transformations as tr
from helpers import *


def pytest_generate_tests(metafunc):
    """Parametrize every test requesting ``any_dtype`` over a set of dtypes.

    The set consists of two integer dtypes, one float dtype and one complex
    dtype; the test ids are the string forms of the dtypes.
    """
    int_dtypes = [numpy.dtype('int32'), numpy.dtype('int64')]
    float_dtypes = [numpy.dtype('float32')]
    complex_dtypes = [numpy.dtype('complex64')]

    # ``funcargnames`` is a deprecated alias of ``fixturenames`` that newer
    # pytest releases no longer provide; fall back to it only when the
    # modern attribute is missing.
    requested = getattr(metafunc, 'fixturenames', None)
    if requested is None:
        requested = metafunc.funcargnames

    if 'any_dtype' in requested:
        dtypes = int_dtypes + float_dtypes + complex_dtypes
        metafunc.parametrize('any_dtype', dtypes, ids=[str(x) for x in dtypes])


# Elementwise computation whose kernel stores every loaded input element
# into the output at the same index.
TestComputation = specialize_elementwise(
    'output', 'input', None,
    dict(kernel="${output.store}(idx, ${input.load}(idx));"))


def test_identity(some_ctx, any_dtype):
    """Check that identity transformations on both ends keep the data intact."""
    host_data = get_test_array((1000, ), any_dtype)
    input_dev = some_ctx.to_device(host_data)
    output_dev = some_ctx.empty_like(input_dev)

    test = TestComputation(some_ctx)
    test.connect(tr.identity(), 'input', ['input_prime'])
    test.connect(tr.identity(), 'output', ['output_prime'])
    test.prepare_for(output_dev, input_dev)
    test(output_dev, input_dev)

    # Without this check the test could only fail on an exception.
    # NOTE(review): diff_is_negligible is assumed to come from the
    # ``helpers`` star import - confirm.
    assert diff_is_negligible(output_dev.get(), host_data)
def _construct_operations(self, basis, device_params):
    """Record the operations needed to run the FFT described by ``basis``.

    When the product of ``basis.shape`` over ``basis.axes`` is 1, a single
    identity elementwise computation is recorded. Otherwise a kernel chain
    is requested from ``get_fft_kernels()`` and every kernel is added with
    the largest work group size it accepts.

    :param basis: object providing ``shape``, ``axes`` and ``dtype``.
    :param device_params: object providing ``max_work_group_size``; also
        forwarded to ``get_fft_kernels()``.
    :returns: the recorder created by ``self._get_operation_recorder()``.
    :raises ValueError: when a global kernel is out of resources for every
        work group size down to 1, or when the chain cannot be built for any
        local kernel limit down to 1.
    """
    if product([basis.shape[i] for i in basis.axes]) == 1:
        # Trivial problem. Need to add a dummy kernel
        # because we still have to run transformations.
        operations = self._get_operation_recorder()
        identity = self.get_nested_computation(
            specialize_elementwise(
                'output', 'input', 'direction',
                dict(kernel="${output.store}(idx, ${input.load}(idx));")))
        operations.add_computation(identity, 'output', 'input', 'direction')
        return operations

    # While resource consumption of GlobalFFTKernel can be made lower by passing
    # lower value to prepare_for(), LocalFFTKernel may have to be split into several kernels.
    # Therefore, if GlobalFFTKernel.prepare_for() raises OutOfResourcesError,
    # we just call prepare_for() with lower limit, but if LocalFFTKernel.prepare_for()
    # does that, we have to recreate the whole chain.
    local_kernel_limit = device_params.max_work_group_size

    while local_kernel_limit >= 1:
        # Starting from scratch.
        operations = self._get_operation_recorder()
        kernels = get_fft_kernels(basis, device_params, local_kernel_limit)

        for i, kernel in enumerate(kernels):
            # The first kernel reads 'input', the last one writes 'output';
            # every other link goes through a freshly added allocation.
            mem_in = 'input' if i == 0 else mem_out
            if i == len(kernels) - 1:
                mem_out = 'output'
            else:
                mem_out = operations.add_allocation(
                    kernel.output_shape, basis.dtype)

            if kernel.kweights is not None:
                kweights = operations.add_const_allocation(
                    kernel.kweights.astype(basis.dtype))
                kweights_arg = [kweights]
            else:
                kweights_arg = []

            argnames = [mem_out, mem_in] + kweights_arg + ['direction']

            # Try to find local size for each of the kernels
            local_size = device_params.max_work_group_size
            # marks the event when LocalFFTKernel is out of resources
            local_kernel_fail = False
            while local_size >= 1 and not local_kernel_fail:
                try:
                    gs, ls, kwds = kernel.prepare_for(local_size)
                    operations.add_kernel(
                        TEMPLATE, kernel.name, argnames,
                        global_size=gs, local_size=ls, render_kwds=kwds,
                        inplace=([(mem_out, mem_in)]
                                 if kernel.inplace_possible else None))
                except OutOfResourcesError:
                    # A global kernel is retried with a halved size; any
                    # other kernel invalidates the whole chain.
                    if isinstance(kernel, GlobalFFTKernel):
                        local_size //= 2
                    else:
                        local_kernel_fail = True
                else:
                    # The kernel was added, no further attempts are needed.
                    break
            else:
                if not local_kernel_fail:
                    raise ValueError(
                        "Could not find suitable call parameters for one of the global kernels"
                    )

            if local_kernel_fail:
                break
        else:
            # everything went well, returning the recorded operations
            return operations

        # The cycle above received 'break', meaning that LocalFFTKernel was out of resources.
        # Reduce the limit and try to create operations from scratch again.
        local_kernel_limit //= 2
    else:
        raise ValueError(
            "Could not find suitable call parameters for one of the local kernels"
        )