def test_ntt_lsh_method_performance(
        thread, single_kernel_bootstrap, heavy_performance_load, ntt_lsh_method):

    if thread.api.get_id() != cuda_id() and ntt_lsh_method == 'cuda_asm':
        pytest.skip()

    size = 4096 if heavy_performance_load else 64

    rng = numpy.random.RandomState()
    secret_key, cloud_key = make_key_pair(thread, rng, transform_type='NTT')

    # TODO: instead of creating a whole key and then checking if the parameters are supported,
    # we can just create a parameter object separately.
    if (single_kernel_bootstrap
            and not single_kernel_bootstrap_supported(secret_key.params, thread.device_params)):
        pytest.skip()

    perf_params = PerformanceParameters(
        secret_key.params,
        single_kernel_bootstrap=single_kernel_bootstrap,
        ntt_lsh_method=ntt_lsh_method)

    results = check_performance(thread, (secret_key, cloud_key), perf_params, shape=size)
    print()
    print(check_performance_str(results))
def __init__(self, api, pnum, dnum):
    self.api_id = api.get_id()
    self.pnum = pnum
    self.dnum = dnum

    self._api = api
    platform = api.get_platforms()[pnum]
    self._device = platform.get_devices()[dnum]
    platform_name = platform.name
    device_name = self._device.name

    self.device_params = api.DeviceParameters(self._device)

    self.device_id = "{api},{pnum},{dnum}".format(api=api.get_id(), pnum=pnum, dnum=dnum)
    self.device_full_name = platform_name + ", " + device_name
    self.id = self.device_id

    # if we import it at module level, it interferes with the coverage results
    import reikna.cluda as cluda
    self.cuda = (api.get_id() == cluda.cuda_id())
def check_func_performance(
        tag, thread, func_module, reference_func, output_type, input_types,
        ranges=None, heavy_performance_load=False):

    N = 1024 * (1024 if heavy_performance_load else 32)
    repetitions = 100000

    test = get_func_kernel(thread, func_module, output_type, input_types)
    perf_test = get_func_kernel(
        thread, func_module, output_type, input_types, repetitions=repetitions)

    arrays = [
        get_test_array(N, tp, val_range=ranges[i] if ranges is not None else None)
        for i, tp in enumerate(input_types)]

    arrays_dev = [thread.to_device(arr) for arr in arrays]
    dest_dev = thread.array(N, tp_dtype(output_type))

    # Sanity check
    test(dest_dev, *arrays_dev, global_size=N)
    assert (dest_dev.get() == reference_func(*arrays)).all()

    # Performance check
    times = []
    for j in range(10):
        thread.synchronize()
        t1 = time.time()
        perf_test(dest_dev, *arrays_dev, global_size=N)
        thread.synchronize()
        t2 = time.time()
        times.append(t2 - t1)

    # Each measurement covers `repetitions` kernel applications over `N` elements,
    # so the printed min/mean/std are in picoseconds per element per application.
    times = numpy.array(times)
    times /= repetitions
    times /= N
    times *= 1e12

    print()
    print(
        "{backend}: {tag} --- min: {min:.4f}, mean: {mean:.4f}, std: {std:.4f}".format(
            tag=tag, min=times.min(), mean=times.mean(), std=times.std(),
            backend='cuda' if thread.api.get_id() == cluda.cuda_id() else 'ocl '))
def test_sub_perf(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    if method == "c_from_asm":
        pytest.skip()
    check_func_performance(
        "sub(), " + method, thread, ntt.sub(method=method), ref_sub,
        'ff_number', ['ff_number', 'ff_number'])
def test_mul(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(
        thread, ntt.mul(method=method), ref_mul,
        'ff_number', ['ff_number', 'ff_number'],
        test_values=[
            # regression test for an error in method=c
            (ntt_cpu.GaloisNumber.modulus - 1, 2**33)])
def test_mod(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(
        thread, ntt.mod(method=method), ref_mod,
        'ff_number', [numpy.uint64],
        test_values=[
            (ntt_cpu.GaloisNumber.modulus - 1,),
            (ntt_cpu.GaloisNumber.modulus,),
            (ntt_cpu.GaloisNumber.modulus + 1,)])
def test_lsh(thread, exp_range, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    exp_dtype = numpy.uint32
    check_func(
        thread, ntt.lsh(exp_range, exp_dtype, method=method), ref_lsh,
        'ff_number', ['ff_number', exp_dtype],
        ranges=[None, (exp_range - 32, exp_range)],
        test_values=[(11509900421665959066, exp_range - 1)])
def test_add_perf(thread, method, heavy_performance_load):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    if method == "c_from_asm":
        pytest.skip()
    check_func_performance(
        "add(), " + method, thread, ntt.add(method=method), ref_add,
        'ff_number', ['ff_number', 'ff_number'],
        heavy_performance_load=heavy_performance_load)
def test_constant_memory(thr):

    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)

    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(numpy.float32)

    src = """
    KERNEL void test(GLOBAL_MEM float *dest)
    {
        const SIZE_T i = get_global_id(0);
        dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
    }
    """

    constant_arrays = dict(
        # use the parameters of an existing array; the array itself is not copied!
        arr1=arr1,
        arr2=(global_size * 2, numpy.float32))

    # Compiled-in constant arrays are only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile(src, constant_arrays=constant_arrays)
        return

    program = thr.compile(src, constant_arrays=constant_arrays)
    test = program.test

    # TODO: program.constant.arr1.set(arr1)
    program.set_constant('arr1', arr1)
    program.set_constant('arr2', arr2)

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref)

    # Use a program/kernel with different constant arrays
    arr1_2 = get_test_array(global_size, numpy.int32)
    arr2_2 = get_test_array(global_size * 2, numpy.float32)
    ref_2 = ((arr2_2[:global_size] + arr2_2[global_size:]) * arr1_2).astype(numpy.float32)

    program2 = thr.compile(src, constant_arrays=constant_arrays)
    test2 = program2.test

    program2.set_constant('arr1', arr1_2)
    program2.set_constant('arr2', arr2_2)

    test2(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref_2)

    # Check that the first kernel preserved the data in its constant arrays
    test(dest_dev, global_size=global_size)
    assert diff_is_negligible(dest_dev.get(), ref)
def __init__(self, api_id, platform_id, device_id):
    self.api_id = api_id
    self.platform_id = platform_id
    self.device_id = device_id

    # We do not save the actual device object, because this object
    # can be passed to another thread/process.
    api, platform, device = self._get_objects()

    self.platform_name = platform.name
    self.device_name = device.name
    self.api_name = "CUDA" if api.get_id() == cluda.cuda_id() else "OpenCL"
def test_constant_memory_static_kernel(thr):

    global_size = 100

    arr1 = get_test_array(global_size, numpy.int32)
    arr2 = get_test_array(global_size * 2, numpy.float32)

    ref = ((arr2[:global_size] + arr2[global_size:]) * arr1).astype(numpy.float32)

    src = """
    KERNEL void test(GLOBAL_MEM float *dest)
    {
        const SIZE_T i = get_global_id(0);
        dest[i] = (arr2[i] + arr2[i + get_global_size(0)]) * arr1[i];
    }
    """

    constant_arrays = dict(
        # use the parameters of an existing array; the array itself is not copied!
        arr1=arr1,
        arr2=(global_size * 2, numpy.float32))

    # Compiled-in constant arrays are only supported on CUDA
    if thr.api.get_id() != cluda.cuda_id():
        with pytest.raises(ValueError):
            program = thr.compile_static(
                src, "test", global_size=global_size, constant_arrays=constant_arrays)
        return

    test = thr.compile_static(
        src, "test", global_size=global_size, constant_arrays=constant_arrays)

    # TODO: test.constant.arr1.set(arr1)
    test.set_constant('arr1', arr1)
    # test that a device array is also accepted
    test.set_constant('arr2', thr.to_device(arr2))

    dest_dev = thr.array(global_size, numpy.float32)
    test(dest_dev)
    assert diff_is_negligible(dest_dev.get(), ref)
def test_ntt_lsh_method_performance(
        thread, single_kernel_bootstrap, heavy_performance_load, ntt_lsh_method):

    if thread.api.get_id() != cuda_id() and ntt_lsh_method == 'cuda_asm':
        pytest.skip()

    size = 4096 if heavy_performance_load else 64

    rng = numpy.random.RandomState()
    secret_key, cloud_key = make_key_pair(thread, rng, transform_type='NTT')

    perf_params = performance_parameters(
        nufhe_params=secret_key.params,
        single_kernel_bootstrap=single_kernel_bootstrap,
        ntt_lsh_method=ntt_lsh_method)

    results = check_performance(thread, (secret_key, cloud_key), perf_params, shape=size)
    print()
    print(check_performance_str(results))
def test_fft_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):

    if not transform_supported(thread.device_params, 'FFT'):
        pytest.skip()

    if transforms_per_block > max_supported_transforms_per_block(thread.device_params, 'FFT'):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (512,), numpy.complex128)

    kernel_repetitions = 100 if heavy_performance_load else 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    res_ref = tr_fft.fft_transform_ref(a)

    transform = fft512(use_constant_memory=constant_memory)

    fft_comp = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block).compile(thread)
    fft_comp_repeated = Transform(
        transform, batch_shape,
        transforms_per_block=transforms_per_block,
        kernel_repetitions=kernel_repetitions).compile(thread)

    # Quick check of correctness
    fft_comp(res_dev, a_dev)
    res_test = res_dev.get()
    assert numpy.allclose(res_test, res_ref)

    # Test performance
    times, times_str = get_times(thread, fft_comp_repeated, res_dev, a_dev)
    print("\n{backend}, {trnum} per block, test --- {times}".format(
        times=times_str, backend='cuda' if is_cuda else 'ocl ', trnum=transforms_per_block))
def __init__(self, tr_tree, translator, thread, fast_math, compiler_options, keep):
    """__init__()""" # hide the signature from Sphinx

    self._thread = thread
    self._is_cuda = (thread.api.get_id() == cuda_id())

    self._tr_tree = tr_tree
    self._translator = translator
    self._fast_math = fast_math
    self._compiler_options = compiler_options
    self._keep = keep

    self._nested_comp_idgen = IdGen('_nested')
    self._persistent_value_idgen = IdGen('_value')
    self._constant_value_idgen = IdGen('_constant')
    self._temp_array_idgen = IdGen('_temp')

    self._external_annotations = self._tr_tree.get_root_annotations()
    self._persistent_values = {}
    self._constant_arrays = {}
    self._temp_arrays = set()
    self._internal_annotations = {}
    self._kernels = []
def performance_parameters_for_device(perf_params, device_params):

    is_cuda = device_params.api_id == cuda_id()

    ntt_base_method = perf_params.ntt_base_method
    ntt_mul_method = perf_params.ntt_mul_method
    ntt_lsh_method = perf_params.ntt_lsh_method

    if ntt_base_method is None:
        ntt_base_method = 'cuda_asm' if is_cuda else 'c'
    if ntt_mul_method is None:
        ntt_mul_method = 'cuda_asm' if is_cuda else 'c'
    if ntt_lsh_method is None:
        ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'

    pdict = perf_params._asdict()
    pdict.update(
        ntt_base_method=ntt_base_method,
        ntt_mul_method=ntt_mul_method,
        ntt_lsh_method=ntt_lsh_method)

    return PerformanceParameters(**pdict)
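# Usage sketch (illustrative, not part of the source): resolving the
# backend-dependent NTT method defaults before compiling kernels.
# `perf_params` is assumed to be a PerformanceParameters instance as used above,
# and `device_params` to come from a reikna CLUDA thread; the helper name
# `_example_resolve_ntt_methods` is hypothetical.
def _example_resolve_ntt_methods(perf_params, device_params):
    resolved = performance_parameters_for_device(perf_params, device_params)
    # On a CUDA device the unspecified methods default to 'cuda_asm';
    # on OpenCL they fall back to the portable 'c' implementations.
    assert resolved.ntt_base_method in ('cuda_asm', 'c')
    return resolved.ntt_base_method, resolved.ntt_mul_method, resolved.ntt_lsh_method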
def test_ntt_performance(thread, transforms_per_block, constant_memory, heavy_performance_load):

    if not transform_supported(thread.device_params, 'NTT'):
        pytest.skip()

    if transforms_per_block > max_supported_transforms_per_block(thread.device_params, 'NTT'):
        pytest.skip()

    is_cuda = thread.api.get_id() == cuda_id()

    methods = list(itertools.product(
        ['cuda_asm', 'c'], # base method
        ['cuda_asm', 'c_from_asm', 'c'], # mul method
        ['cuda_asm', 'c_from_asm', 'c'])) # lsh method

    if not is_cuda:
        # filter out all usage of CUDA asm if we're on OpenCL
        methods = [ms for ms in methods if 'cuda_asm' not in ms]

    batch_shape = (2**14,)
    a = get_test_array(batch_shape + (1024,), "ff_number")

    kernel_repetitions = 100 if heavy_performance_load else 5

    a_dev = thread.to_device(a)
    res_dev = thread.empty_like(a_dev)

    # TODO: compute a reference NTT when it's fast enough on CPU
    #res_ref = tr_ntt.ntt_transform_ref(a)

    print()
    min_times = []

    for base_method, mul_method, lsh_method in methods:

        transform = ntt1024(
            base_method=base_method, mul_method=mul_method, lsh_method=lsh_method,
            use_constant_memory=constant_memory)

        ntt_comp = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block).compile(thread)
        ntt_comp_repeated = Transform(
            transform, batch_shape,
            transforms_per_block=transforms_per_block,
            kernel_repetitions=kernel_repetitions).compile(thread)

        # TODO: compute a reference NTT when it's fast enough on CPU
        # Quick check of correctness
        #ntt_comp(res_dev, a_dev)
        #res_test = res_dev.get()
        #assert (res_test == res_ref).all()

        # Test performance
        times, times_str = get_times(thread, ntt_comp_repeated, res_dev, a_dev)
        print("    base: {bm}, mul: {mm}, lsh: {lm}".format(
            bm=base_method, mm=mul_method, lm=lsh_method))
        print("    {backend}, {trnum} per block, test --- {times}".format(
            times=times_str, backend='cuda' if is_cuda else 'ocl ',
            trnum=transforms_per_block))

        min_times.append((times.min(), base_method, mul_method, lsh_method))

    best = min(min_times, key=lambda t: t[0])
    time_best, base_method, mul_method, lsh_method = best
    print("Best time: {tb:.4f} for [base: {bm}, mul: {mm}, lsh: {lm}]".format(
        tb=time_best, bm=base_method, mm=mul_method, lm=lsh_method))
def __init__(self, perf_params: PerformanceParameters, device_params):

    low_end_device = perf_params.low_end_device
    if low_end_device is None:
        # TODO: an arbitrary distinction, need to test on some devices close to it.
        low_end_device = device_params.compute_units < 20

    is_cuda = device_params.api_id == cuda_id()

    transform_type = perf_params.nufhe_params.tgsw_params.tlwe_params.transform_type
    mask_size = perf_params.nufhe_params.tgsw_params.tlwe_params.mask_size
    decomp_length = perf_params.nufhe_params.tgsw_params.decomp_length

    use_constant_memory_multi_iter = perf_params.use_constant_memory_multi_iter
    if use_constant_memory_multi_iter is None:
        use_constant_memory_multi_iter = not low_end_device

    use_constant_memory_single_iter = perf_params.use_constant_memory_single_iter
    if use_constant_memory_single_iter is None:
        use_constant_memory_single_iter = False

    # Avoiding circular reference
    from .polynomial_transform import max_supported_transforms_per_block
    max_supported_tpb = max_supported_transforms_per_block(device_params, transform_type)

    transforms_per_block = perf_params.transforms_per_block
    if transforms_per_block is None:
        if low_end_device:
            transforms_per_block = 1
        else:
            transforms_per_block = 4 if transform_type == 'NTT' else 2
        transforms_per_block = min(transforms_per_block, max_supported_tpb)
    else:
        if transforms_per_block > max_supported_tpb:
            raise ValueError(
                "The chosen device does not support more than "
                + str(max_supported_tpb) + " transforms per block")

    # Avoiding circular reference
    from .blind_rotate import single_kernel_bootstrap_supported

    single_kernel_bootstrap = perf_params.single_kernel_bootstrap
    skbs_supported = single_kernel_bootstrap_supported(perf_params.nufhe_params, device_params)
    if single_kernel_bootstrap is None:
        # If both encryption parameters and device capabilities allow it,
        # single kernel bootstrap is the optimal choice.
        single_kernel_bootstrap = not low_end_device and skbs_supported
    elif single_kernel_bootstrap and not skbs_supported:
        single_kernel_bootstrap_supported(
            perf_params.nufhe_params, device_params, raise_exception=True)

    ntt_base_method = perf_params.ntt_base_method
    ntt_mul_method = perf_params.ntt_mul_method
    ntt_lsh_method = perf_params.ntt_lsh_method

    if 'cuda_asm' in (ntt_base_method, ntt_mul_method, ntt_lsh_method) and not is_cuda:
        raise ValueError("'cuda_asm' is only supported for the CUDA backend")

    if low_end_device:
        skbs = single_kernel_bootstrap
        if ntt_base_method is None:
            ntt_base_method = ('c' if skbs else 'cuda_asm') if is_cuda else 'c'
        if ntt_mul_method is None:
            ntt_mul_method = ('c_from_asm' if skbs else 'cuda_asm') if is_cuda else 'c'
        if ntt_lsh_method is None:
            ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'
    else:
        if ntt_base_method is None:
            ntt_base_method = 'cuda_asm' if is_cuda else 'c'
        if ntt_mul_method is None:
            ntt_mul_method = 'cuda_asm' if is_cuda else 'c'
        if ntt_lsh_method is None:
            ntt_lsh_method = 'cuda_asm' if is_cuda else 'c'

    self.ntt_base_method = ntt_base_method
    self.ntt_mul_method = ntt_mul_method
    self.ntt_lsh_method = ntt_lsh_method

    self.use_constant_memory_multi_iter = use_constant_memory_multi_iter
    self.use_constant_memory_single_iter = use_constant_memory_single_iter
    self.transforms_per_block = transforms_per_block
    self.single_kernel_bootstrap = single_kernel_bootstrap
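# Illustrative helper (not part of the source): the transforms_per_block default
# encoded in the constructor above, shown separately to make the heuristic
# explicit. The helper name `_default_transforms_per_block` is hypothetical;
# the constructor additionally clamps the result to the device-supported maximum.
def _default_transforms_per_block(low_end_device, transform_type):
    if low_end_device:
        # Low-end devices run one transform per block.
        return 1
    # Regular devices: 4 transforms per block for NTT, 2 for FFT.
    return 4 if transform_type == 'NTT' else 2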
def get_id():
    return cluda.cuda_id()
def test_sub(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(thread, ntt.sub(method=method), ref_sub, 'ff_number', ['ff_number', 'ff_number'])
def test_mul_prepared(thread, method):
    if method == "cuda_asm" and thread.api.get_id() != cluda.cuda_id():
        pytest.skip()
    check_func(
        thread, ntt.mul_prepared(method=method), ref_mul_prepared,
        'ff_number', ['ff_number', 'ff_number'])