def uniform_integer(bijection, dtype, low, high=None):
    """
    Creates a sampler of uniformly distributed integers of type ``dtype``.

    When ``high`` is given, ``low < high - 1`` must hold, and the sampler template
    is rendered with ``num = high - low`` values and the offset ``low``.
    When ``high`` is ``None``, the bounds are replaced by ``low, high = 0, low + 1``.
    The bounds are checked (with assertions) against the range of ``dtype``.
    Supported dtypes: any numpy integers.
    If the size of the interval is a power of 2, a fixed number of counters
    is used in each thread.

    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    if high is not None:
        assert low < high - 1
    else:
        # NOTE(review): earlier documentation described this case as ``[0, low)``,
        # but the upper bound used here is ``low + 1`` -- confirm which one is intended.
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    total_bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        # Signed integer: one bit is taken by the sign.
        value_bits = total_bits - 1
        assert low >= -(2 ** value_bits)
        assert high < 2 ** value_bits
    else:
        assert low >= 0
        assert high < 2 ** total_bits

    # Use 32-bit raw random numbers unless there are too many values to cover.
    num = high - low
    raw_dtype = numpy.dtype('uint32' if num <= 2 ** 32 else 'uint64')
    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2 ** (raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    render_kwds = dict(
        bijection=bijection,
        dtype=dtype,
        ctype=ctype,
        raw_ctype=raw_ctype,
        raw_func=raw_func,
        max_num=max_num,
        num=num,
        low=low)
    module = Module(template_def, render_kwds=render_kwds)

    # The sampler is flagged deterministic when ``num`` divides the raw range evenly.
    evenly_divisible = (max_num % num == 0)
    return Sampler(bijection, module, dtype, deterministic=evenly_divisible)
def test_offsets_in_kernel(thr):
    """
    Kernels are given the base data of an array, so for arrays created
    with an offset the kernel has to add that offset by hand.
    """
    global_size = 100
    dest_offset, src_offset = 4, 2
    dtype = dtypes.normalize_type(numpy.int32)
    itemsize = dtype.itemsize

    program = thr.compile(
        """
        KERNEL void test(GLOBAL_MEM int *dest, GLOBAL_MEM int *src)
        {
            const SIZE_T i = get_global_id(0);
            dest[i + ${dest_offset}] = src[i + ${src_offset}];
        }
        """,
        render_kwds=dict(dest_offset=dest_offset, src_offset=src_offset))
    kernel = program.test

    # Destination: a view into a bigger buffer, shifted by ``dest_offset`` elements
    # (the ``offset`` keyword is passed in bytes).
    dest_buffer = thr.array(global_size + dest_offset, dtype)
    dest_view = thr.array(
        global_size, dtype, offset=dest_offset * itemsize, base=dest_buffer)

    # Source: the same arrangement, filled with consecutive integers.
    src_host = numpy.arange(global_size + src_offset).astype(dtype)
    src_buffer = thr.to_device(src_host)
    src_view = thr.array(
        global_size, dtype, offset=src_offset * itemsize, base=src_buffer)

    kernel(dest_view, src_view, global_size=global_size)

    expected = src_host[src_offset:]
    assert diff_is_negligible(dest_view.get(), expected)
def check_performance(thr_and_double, shape_and_axes):
    """
    Times the compiled ``FFTShift`` computation on a complex array
    and checks its result against ``numpy.fft.fftshift``.

    Returns a tuple of the best time out of 10 attempts
    and the size of the processed array in bytes.
    """
    thr, double = thr_and_double
    shape, axes = shape_and_axes

    dtype = dtypes.normalize_type(numpy.complex128 if double else numpy.complex64)
    data = numpy.arange(product(shape)).reshape(shape).astype(dtype)

    compiled_shift = FFTShift(data, axes=axes).compile(thr)

    data_dev = thr.to_device(data)
    res_dev = thr.empty_like(data)

    attempts = 10
    elapsed = []
    for _ in range(attempts):
        started = time.time()
        compiled_shift(res_dev, data_dev)
        # Wait for the device to finish before stopping the clock.
        thr.synchronize()
        elapsed.append(time.time() - started)

    reference = numpy.fft.fftshift(data, axes=axes)
    assert diff_is_negligible(res_dev.get(), reference)

    return min(elapsed), product(shape) * dtype.itemsize
def __init__(self, bijection, module, dtype, randoms_per_call=1, deterministic=False):
    """__init__()""" # hide the signature from Sphinx

    # The dtype is kept in its normalized form; everything else is stored as given.
    normalized_dtype = dtypes.normalize_type(dtype)

    self.bijection = bijection
    self.module = module
    self.dtype = normalized_dtype
    self.randoms_per_call = randoms_per_call
    self.deterministic = deterministic
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """
    Returns a random numpy array of the given ``shape`` and ``dtype`` for use in tests.

    Struct dtypes are filled field by field (recursively).
    Integer dtypes are sampled with ``numpy.random.randint``, all others with
    ``numpy.random.uniform``; complex dtypes get independently drawn real and imaginary parts.
    ``no_zeros`` raises the lower bound to ``1`` (integers) or ``0.01`` (others);
    ``high`` overrides the default upper bound of ``100`` (integers) or ``1.0`` (others).
    If ``strides`` is given, the result is re-viewed with ``as_strided``.
    """
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        result = numpy.empty(shape, dtype)
        for field in dtype.names:
            result[field] = get_test_array(
                shape, dtype[field], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            sample = numpy.random.randint
            low = 1 if no_zeros else 0
            default_high = 100  # will work even with signed chars
        else:
            sample = numpy.random.uniform
            low = 0.01 if no_zeros else 0
            default_high = 1.0

        if high is None:
            high = default_high

        def draw():
            return sample(low, high, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            # The real part is drawn first, then the imaginary part.
            result = draw() + 1j * draw()
        else:
            result = draw()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
def __init__(self, dtype, shape=None, strides=None, offset=0, nbytes=None):
    # Stores shape, dtype, strides, offset and buffer size of an array type,
    # substituting defaults for the omitted strides and ``nbytes``.
    self.shape = wrap_in_tuple(shape) if shape is not None else tuple()
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    itemsize = self.dtype.itemsize

    # Strides: fall back to the default ones, and remember whether they are in use.
    standard_strides = helpers.default_strides(self.shape, itemsize)
    chosen_strides = standard_strides if strides is None else tuple(strides)
    self._default_strides = chosen_strides == standard_strides
    self.strides = chosen_strides

    # Buffer size: same approach, with the minimum size computed from the final strides.
    smallest_nbytes = helpers.min_buffer_size(self.shape, itemsize, self.strides)
    chosen_nbytes = smallest_nbytes if nbytes is None else nbytes
    self._default_nbytes = chosen_nbytes == smallest_nbytes
    self.nbytes = chosen_nbytes

    self.offset = offset
    self._cast = dtypes.cast(self.dtype)
def test_summation(thr):
    """
    Times a ``Reduce`` summation over ``2**22`` ``int64`` elements
    and checks the result against ``numpy``.

    Returns a tuple of the best time out of 10 attempts
    and the size of the input array in bytes.
    """
    perf_size = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    a = get_test_array(perf_size, dtype)
    a_dev = thr.to_device(a)

    reduction = Reduce(a, predicate_sum(dtype))

    b_dev = thr.empty_like(reduction.parameter.output)
    b_ref = numpy.array([a.sum()], dtype)

    compiled_reduction = reduction.compile(thr)

    attempts = 10
    elapsed = []
    for _ in range(attempts):
        started = time.time()
        compiled_reduction(b_dev, a_dev)
        # Wait for the device to finish before stopping the clock.
        thr.synchronize()
        elapsed.append(time.time() - started)

    assert diff_is_negligible(b_dev.get(), b_ref)

    return min(elapsed), perf_size * dtype.itemsize
def array(
        self, shape, dtype, strides=None, offset=0, nbytes=None,
        allocator=None, base=None, base_data=None):
    # Creates an ``Array`` on this thread, optionally backed by an existing buffer.

    # Fall back to this thread's own allocation method.
    if allocator is None:
        allocator = self.allocate

    dtype = dtypes.normalize_type(dtype)
    shape = wrap_in_tuple(shape)

    if nbytes is None:
        nbytes = min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset)

    if base is not None:
        # An explicitly given base array provides the underlying buffer.
        base_data = base.data
    elif base_data is None and (offset != 0 or strides is not None):
        # Custom offset or strides without any buffer supplied: allocate one here.
        base_data = allocator(nbytes)

    return Array(
        self, shape, dtype,
        strides=strides, offset=offset, allocator=allocator,
        base_data=base_data, nbytes=nbytes)
def test_summation(thr):
    """
    Times a ``Reduce`` summation over ``2**22`` ``int64`` elements
    and checks the result against ``numpy``.

    Returns a tuple of the best time out of 10 attempts
    and the size of the input array in bytes.
    """
    perf_size = 2 ** 22
    dtype = dtypes.normalize_type(numpy.int64)

    a = get_test_array(perf_size, dtype)
    a_dev = thr.to_device(a)

    reduction = Reduce(a, predicate_sum(dtype))

    b_dev = thr.empty_like(reduction.parameter.output)
    b_ref = numpy.array([a.sum()], dtype)

    compiled_reduction = reduction.compile(thr)

    attempts = 10
    elapsed = []
    for _ in range(attempts):
        started = time.time()
        compiled_reduction(b_dev, a_dev)
        # Wait for the device to finish before stopping the clock.
        thr.synchronize()
        elapsed.append(time.time() - started)

    assert diff_is_negligible(b_dev.get(), b_ref)

    return min(elapsed), perf_size * dtype.itemsize
def get_test_array(shape, dtype, strides=None, no_zeros=False, high=None):
    """
    Returns a random numpy array of the given ``shape`` and ``dtype`` for use in tests.

    Struct dtypes are filled field by field (recursively).
    Integer dtypes are sampled with ``numpy.random.randint``, all others with
    ``numpy.random.uniform``; complex dtypes get independently drawn real and imaginary parts.
    ``no_zeros`` raises the lower bound to ``1`` (integers) or ``0.01`` (others);
    ``high`` overrides the default upper bound of ``100`` (integers) or ``1.0`` (others).
    If ``strides`` is given, the result is re-viewed with ``as_strided``.
    """
    shape = wrap_in_tuple(shape)
    dtype = dtypes.normalize_type(dtype)

    if dtype.names is not None:
        result = numpy.empty(shape, dtype)
        for field in dtype.names:
            result[field] = get_test_array(
                shape, dtype[field], no_zeros=no_zeros, high=high)
    else:
        if dtypes.is_integer(dtype):
            sample = numpy.random.randint
            low = 1 if no_zeros else 0
            default_high = 100  # will work even with signed chars
        else:
            sample = numpy.random.uniform
            low = 0.01 if no_zeros else 0
            default_high = 1.0

        if high is None:
            high = default_high

        def draw():
            return sample(low, high, shape).astype(dtype)

        if dtypes.is_complex(dtype):
            # The real part is drawn first, then the imaginary part.
            result = draw() + 1j * draw()
        else:
            result = draw()

    if strides is not None:
        result = as_strided(result, result.shape, strides)

    return result
def padded(cls, dtype, shape, pad=0):
    """
    Builds a :py:class:`Type` object describing an array of the given ``shape``
    that is padded by ``pad`` elements from all dimensions.
    """
    normalized = dtypes.normalize_type(dtype)

    # The helper returns the strides, offset and buffer size of the padded layout.
    layout = helpers.padded_buffer_parameters(shape, normalized.itemsize, pad=pad)
    padded_strides, padded_offset, padded_nbytes = layout

    return cls(
        normalized, shape,
        strides=padded_strides, offset=padded_offset, nbytes=padded_nbytes)
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Scan performance for large problem sizes.

    Returns a tuple of the time reported by ``check_scan``
    and the size of an ``int64`` array of shape ``large_perf_shape`` in bytes.
    """
    dtype = dtypes.normalize_type(numpy.int64)

    scan_options = dict(
        dtype=dtype, axes=None, exclusive=exclusive, measure_time=True)
    min_time = check_scan(thr, large_perf_shape, **scan_options)

    processed_bytes = helpers.product(large_perf_shape) * dtype.itemsize
    return min_time, processed_bytes
def uniform_integer(bijection, dtype, low, high=None):
    """
    Creates a sampler of uniformly distributed integers of type ``dtype``.

    When ``high`` is given, ``low < high - 1`` must hold, and the sampler template
    is rendered with ``num = high - low`` values and the offset ``low``.
    When ``high`` is ``None``, the bounds are replaced by ``low, high = 0, low + 1``.
    The bounds are checked (with assertions) against the range of ``dtype``.
    Supported dtypes: any numpy integers.
    If the size of the interval is a power of 2, a fixed number of counters
    is used in each thread.

    Returns a :py:class:`~reikna.cbrng.samplers.Sampler` object.
    """
    if high is not None:
        assert low < high - 1
    else:
        # NOTE(review): earlier documentation described this case as ``[0, low)``,
        # but the upper bound used here is ``low + 1`` -- confirm which one is intended.
        low, high = 0, low + 1

    dtype = dtypes.normalize_type(dtype)
    ctype = dtypes.ctype(dtype)

    total_bits = dtype.itemsize * 8
    if dtype.kind == 'i':
        # Signed integer: one bit is taken by the sign.
        value_bits = total_bits - 1
        assert low >= -(2 ** value_bits)
        assert high < 2 ** value_bits
    else:
        assert low >= 0
        assert high < 2 ** total_bits

    # Use 32-bit raw random numbers unless there are too many values to cover.
    num = high - low
    raw_dtype = numpy.dtype('uint32' if num <= 2 ** 32 else 'uint64')
    raw_func = bijection.raw_functions[raw_dtype]
    max_num = 2 ** (raw_dtype.itemsize * 8)
    raw_ctype = dtypes.ctype(dtypes.normalize_type(raw_dtype))

    template_def = TEMPLATE.get_def("uniform_integer")
    render_kwds = dict(
        bijection=bijection,
        dtype=dtype,
        ctype=ctype,
        raw_ctype=raw_ctype,
        raw_func=raw_func,
        max_num=max_num,
        num=num,
        low=low)
    module = Module(template_def, render_kwds=render_kwds)

    # The sampler is flagged deterministic when ``num`` divides the raw range evenly.
    evenly_divisible = (max_num % num == 0)
    return Sampler(bijection, module, dtype, deterministic=evenly_divisible)
def padded(cls, dtype, shape, pad=0):
    """
    Builds a :py:class:`Type` object describing an array of the given ``shape``
    that is padded by ``pad`` elements from all dimensions.
    """
    normalized = dtypes.normalize_type(dtype)

    # The helper returns the strides, offset and buffer size of the padded layout.
    layout = helpers.padded_buffer_parameters(shape, normalized.itemsize, pad=pad)
    padded_strides, padded_offset, padded_nbytes = layout

    return cls(
        normalized, shape,
        strides=padded_strides, offset=padded_offset, nbytes=padded_nbytes)
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Scan performance for small problem sizes in big batches.

    Returns a tuple of the time reported by ``check_scan``
    and the size of the ``complex128`` test array in bytes.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)

    # Only the last axis is scanned.
    scan_options = dict(
        dtype=dtype, axes=(-1,), exclusive=exclusive,
        measure_time=True, seq_size=seq_size)
    min_time = check_scan(thr, shape, **scan_options)

    processed_bytes = helpers.product(shape) * dtype.itemsize
    return min_time, processed_bytes
def __init__(self, dtype, shape=None, strides=None):
    # Stores shape, dtype and strides of an array type,
    # computing the strides from the shape when they are omitted.
    self.shape = wrap_in_tuple(shape) if shape is not None else tuple()
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    if strides is not None:
        self.strides = strides
    else:
        # The stride of an axis is the item size times
        # the product of the lengths of all the axes after it.
        itemsize = self.dtype.itemsize
        self.strides = tuple(
            itemsize * product(self.shape[following:])
            for following in range(1, len(self.shape) + 1))

    self._cast = dtypes.cast(self.dtype)
def test_large_scan_performance(thr, large_perf_shape, exclusive):
    """
    Scan performance for large problem sizes.

    Returns a tuple of the time reported by ``check_scan``
    and the size of an ``int64`` array of shape ``large_perf_shape`` in bytes.
    """
    dtype = dtypes.normalize_type(numpy.int64)

    scan_options = dict(
        dtype=dtype, axes=None, exclusive=exclusive, measure_time=True)
    min_time = check_scan(thr, large_perf_shape, **scan_options)

    processed_bytes = helpers.product(large_perf_shape) * dtype.itemsize
    return min_time, processed_bytes
def philox(bitness, counter_words, rounds=10):
    """
    A counter-based RNG built from a small number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, the number of integers generated in one go.
    :param rounds: ``1`` to ``12``; the more rounds, the better randomness is achieved.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Looked up by bitness.
    w_constants_by_bitness = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B),  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85),  # sqrt(3)-1
        ],
    }

    # Looked up by (bitness, counter_words).
    m_constants_by_kind = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [
            numpy.uint64(0xD2E7470EE14C6C93),
            numpy.uint64(0xCA5A826395121157),
        ],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [
            numpy.uint32(0xD2511F53),
            numpy.uint32(0xCD9E8D57),
        ],
    }

    assert 1 <= rounds <= 12

    word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(word_type)

    # The key takes half as many words as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("philox")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        w_constants=w_constants_by_bitness[bitness],
        m_constants=m_constants_by_kind[(bitness, counter_words)])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def __init__(self, dtype, shape=None, strides=None):
    # Stores shape, dtype and strides of an array type,
    # computing the strides from the shape when they are omitted.
    self.shape = wrap_in_tuple(shape) if shape is not None else tuple()
    self.size = product(self.shape)
    self.dtype = dtypes.normalize_type(dtype)
    self.ctype = dtypes.ctype_module(self.dtype)

    if strides is not None:
        self.strides = strides
    else:
        # The stride of an axis is the item size times
        # the product of the lengths of all the axes after it.
        itemsize = self.dtype.itemsize
        self.strides = tuple(
            itemsize * product(self.shape[following:])
            for following in range(1, len(self.shape) + 1))

    self._cast = dtypes.cast(self.dtype)
def test_small_scan_performance(thr, exclusive, seq_size):
    """
    Scan performance for small problem sizes in big batches.

    Returns a tuple of the time reported by ``check_scan``
    and the size of the ``complex128`` test array in bytes.
    """
    dtype = dtypes.normalize_type(numpy.complex128)
    shape = (500, 2, 2, 512)

    # Only the last axis is scanned.
    scan_options = dict(
        dtype=dtype, axes=(-1,), exclusive=exclusive,
        measure_time=True, seq_size=seq_size)
    min_time = check_scan(thr, shape, **scan_options)

    processed_bytes = helpers.product(shape) * dtype.itemsize
    return min_time, processed_bytes
def single_kernel_bootstrap_supported(nufhe_params, device_params, raise_exception=False):
    """
    Checks whether single-kernel bootstrap can be used
    with the given scheme parameters on the given device.

    Returns ``True`` if every check passes. If a check fails, returns ``False``,
    or raises ``ValueError`` naming the reason when ``raise_exception`` is set.
    """

    def unsupported(reason):
        # Common exit for every failed check.
        if raise_exception:
            raise ValueError(reason)
        return False

    if device_params.api_id == ocl_id():
        # OpenCL uses some local memory for kernel arguments if there are many of them,
        # and we need all the available local memory for internal buffers.
        return unsupported("Single-kernel bootstrap is not supported for OpenCL")

    tgsw_params = nufhe_params.tgsw_params
    tlwe_params = tgsw_params.tlwe_params

    reqs = get_transform(tlwe_params.transform_type).transform_module_requirements()

    mask_size = tlwe_params.mask_size
    decomp_length = tgsw_params.decomp_length

    if mask_size != 1 or decomp_length != 2:
        return unsupported(
            "Single-kernel bootstrap is only supported for mask_size=1 and decomp_length=2")

    # The threads of all the transforms have to fit into one work group.
    skb_transforms = (mask_size + 1) * decomp_length
    threads_needed = reqs['threads_per_transform'] * skb_transforms
    if threads_needed > device_params.max_work_group_size:
        return unsupported(
            "The chosen device does not support a block/workgroup size big enough "
            "to run single-kernel bootstrap")

    # Local memory demand, to be compared with what the device reports as available.
    tr_size = reqs['transform_length'] * reqs['elem_dtype_itemsize']
    temp_size = reqs['temp_length'] * reqs['temp_dtype_itemsize']
    poly_dtype_itemsize = dtypes.normalize_type(Torus32).itemsize
    sh_size = max(tr_size, temp_size)

    transform_part = sh_size * (skb_transforms + mask_size)
    polynomial_part = (mask_size + 1) * reqs['polynomial_length'] * poly_dtype_itemsize
    required_lmem_size = transform_part + polynomial_part

    if required_lmem_size > device_params.local_mem_size:
        return unsupported(
            "The chosen device does not have enough shared/local memory "
            "to run single-kernel bootstrap")

    return True
def philox(bitness, counter_words, rounds=10):
    """
    A counter-based RNG built from a small number of slow rounds (multiplications).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, the number of integers generated in one go.
    :param rounds: ``1`` to ``12``; the more rounds, the better randomness is achieved.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Looked up by bitness.
    w_constants_by_bitness = {
        64: [
            numpy.uint64(0x9E3779B97F4A7C15),  # golden ratio
            numpy.uint64(0xBB67AE8584CAA73B),  # sqrt(3)-1
        ],
        32: [
            numpy.uint32(0x9E3779B9),  # golden ratio
            numpy.uint32(0xBB67AE85),  # sqrt(3)-1
        ],
    }

    # Looked up by (bitness, counter_words).
    m_constants_by_kind = {
        (64, 2): [numpy.uint64(0xD2B74407B1CE6E93)],
        (64, 4): [
            numpy.uint64(0xD2E7470EE14C6C93),
            numpy.uint64(0xCA5A826395121157),
        ],
        (32, 2): [numpy.uint32(0xD256D193)],
        (32, 4): [
            numpy.uint32(0xD2511F53),
            numpy.uint32(0xCD9E8D57),
        ],
    }

    assert 1 <= rounds <= 12

    word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(word_type)

    # The key takes half as many words as the counter.
    key_words = counter_words // 2
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("philox")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        w_constants=w_constants_by_bitness[bitness],
        m_constants=m_constants_by_kind[(bitness, counter_words)])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def array(
        self, shape, dtype, strides=None, offset=0, nbytes=None,
        allocator=None, base=None, base_data=None):
    # Creates an ``Array`` on this thread, optionally backed by an existing buffer.

    # In PyCUDA, the default allocator is not None, but a default alloc object
    if allocator is None:
        allocator = cuda.mem_alloc

    dtype = dtypes.normalize_type(dtype)
    shape = wrap_in_tuple(shape)

    if nbytes is None:
        required = min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset)
        nbytes = int(required)

    if base is not None:
        # Take the buffer from the base object: ``base_data`` for instances of ``Array``,
        # ``gpudata`` for anything else.
        base_data = base.base_data if isinstance(base, Array) else base.gpudata
    elif base_data is None and (offset != 0 or strides is not None):
        # Custom offset or strides without any buffer supplied: allocate one here.
        base_data = allocator(nbytes)

    return Array(
        self, shape, dtype,
        strides=strides, allocator=allocator, offset=offset,
        base_data=base_data, nbytes=nbytes)
def test_offsets_in_kernel(thr):
    """
    Kernels are given the base data of an array, so for arrays created
    with an offset the kernel has to add that offset by hand.
    """
    global_size = 100
    dest_offset, src_offset = 4, 2
    dtype = dtypes.normalize_type(numpy.int32)
    itemsize = dtype.itemsize

    program = thr.compile(
        """
        KERNEL void test(GLOBAL_MEM int *dest, GLOBAL_MEM int *src)
        {
            const SIZE_T i = get_global_id(0);
            dest[i + ${dest_offset}] = src[i + ${src_offset}];
        }
        """,
        render_kwds=dict(dest_offset=dest_offset, src_offset=src_offset))
    kernel = program.test

    # Destination: a view into a bigger buffer, shifted by ``dest_offset`` elements
    # (the ``offset`` keyword is passed in bytes).
    dest_buffer = thr.array(global_size + dest_offset, dtype)
    dest_view = thr.array(
        global_size, dtype, offset=dest_offset * itemsize, base=dest_buffer)

    # Source: the same arrangement, filled with consecutive integers.
    src_host = numpy.arange(global_size + src_offset).astype(dtype)
    src_buffer = thr.to_device(src_host)
    src_view = thr.array(
        global_size, dtype, offset=src_offset * itemsize, base=src_buffer)

    kernel(dest_view, src_view, global_size=global_size)

    expected = src_host[src_offset:]
    assert diff_is_negligible(dest_view.get(), expected)
def array(
        self, shape, dtype, strides=None, offset=0, nbytes=None,
        allocator=None, base=None, base_data=None):
    # Creates an ``Array`` on this thread, optionally backed by an existing buffer.

    # In PyCUDA, the default allocator is not None, but a default alloc object
    if allocator is None:
        allocator = cuda.mem_alloc

    dtype = dtypes.normalize_type(dtype)
    shape = wrap_in_tuple(shape)

    if nbytes is None:
        required = min_buffer_size(shape, dtype.itemsize, strides=strides, offset=offset)
        nbytes = int(required)

    if base is not None:
        # Take the buffer from the base object: ``base_data`` for instances of ``Array``,
        # ``gpudata`` for anything else.
        base_data = base.base_data if isinstance(base, Array) else base.gpudata
    elif base_data is None and (offset != 0 or strides is not None):
        # Custom offset or strides without any buffer supplied: allocate one here.
        base_data = allocator(nbytes)

    return Array(
        self, shape, dtype,
        strides=strides, allocator=allocator, offset=offset,
        base_data=base_data, nbytes=nbytes)
def threefry(bitness, counter_words, rounds=20):
    """
    A counter-based RNG built from a big number of fast rounds (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, the number of integers generated in one go.
    :param rounds: ``1`` to ``72``; the more rounds, the better randomness is achieved.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Rotation tables, looked up by (bitness, counter_words).
    # Every table is written row by row and transposed afterwards.
    rotations_by_kind = {
        # These are the R_256 constants from the Threefish reference sources
        # with names changed to R_64x4...
        (64, 4): numpy.array([
            [14, 52, 23, 5, 25, 46, 58, 32],
            [16, 57, 40, 37, 33, 12, 22, 32],
        ]).T,

        # Output from skein_rot_search: (srs64_B64-X1000)
        # Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
        # Start: Tue Mar 1 10:07:48 2011
        # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
        (64, 2): numpy.array([
            [16, 42, 12, 31, 16, 32, 24, 21],
        ]).T,
        # 4 rounds: minHW = 4 [ 4 4 4 4 ]
        # 5 rounds: minHW = 8 [ 8 8 8 8 ]
        # 6 rounds: minHW = 16 [ 16 16 16 16 ]
        # 7 rounds: minHW = 32 [ 32 32 32 32 ]
        # 8 rounds: minHW = 64 [ 64 64 64 64 ]
        # 9 rounds: minHW = 64 [ 64 64 64 64 ]
        # 10 rounds: minHW = 64 [ 64 64 64 64 ]
        # 11 rounds: minHW = 64 [ 64 64 64 64 ]

        # Output from skein_rot_search: (srs-B128-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
        # Start: Mon Aug 24 22:41:36 2009
        # ...
        # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
        (32, 4): numpy.array([
            [10, 11, 13, 23, 6, 17, 25, 18],
            [26, 21, 27, 5, 20, 11, 10, 20],
        ]).T,
        # 4 rounds: minHW = 3 [ 3 3 3 3 ]
        # 5 rounds: minHW = 7 [ 7 7 7 7 ]
        # 6 rounds: minHW = 12 [ 13 12 13 12 ]
        # 7 rounds: minHW = 22 [ 22 23 22 23 ]
        # 8 rounds: minHW = 31 [ 31 31 31 31 ]
        # 9 rounds: minHW = 32 [ 32 32 32 32 ]
        # 10 rounds: minHW = 32 [ 32 32 32 32 ]
        # 11 rounds: minHW = 32 [ 32 32 32 32 ]

        # Output from skein_rot_search (srs32x2-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
        # Start: Tue Jul 12 11:11:33 2011
        # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format
        (32, 2): numpy.array([
            [13, 15, 26, 6, 17, 29, 16, 24],
        ]).T,
        # 4 rounds: minHW = 4 [ 4 4 4 4 ]
        # 5 rounds: minHW = 6 [ 6 8 6 8 ]
        # 6 rounds: minHW = 9 [ 9 12 9 12 ]
        # 7 rounds: minHW = 16 [ 16 24 16 24 ]
        # 8 rounds: minHW = 32 [ 32 32 32 32 ]
        # 9 rounds: minHW = 32 [ 32 32 32 32 ]
        # 10 rounds: minHW = 32 [ 32 32 32 32 ]
        # 11 rounds: minHW = 32 [ 32 32 32 32 ]
    }

    # Taken from Skein; looked up by bitness.
    parity_by_bitness = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(word_type)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("threefry")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        rotation_constants=rotations_by_kind[(bitness, counter_words)],
        parity_constant=parity_by_bitness[bitness])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
def __init__(self, module, dtype, components=1):
    # The dtype is kept in its normalized form; the other arguments are stored as given.
    normalized_dtype = dtypes.normalize_type(dtype)

    self.dtype = normalized_dtype
    self.components = components
    self.module = module
def __init__(self, module, dtype, components=1, noise_sources=1, real_noise=False):
    # The dtype is kept in its normalized form; the other arguments are stored as given.
    normalized_dtype = dtypes.normalize_type(dtype)

    self.dtype = normalized_dtype
    self.real_noise = real_noise
    self.noise_sources = noise_sources
    self.components = components
    self.module = module
def threefry(bitness, counter_words, rounds=20):
    """
    A counter-based RNG built from a big number of fast rounds (bit rotations).

    :param bitness: ``32`` or ``64``, the size of the generated random integers.
    :param counter_words: ``2`` or ``4``, the number of integers generated in one go.
    :param rounds: ``1`` to ``72``; the more rounds, the better randomness is achieved.
        The default value is big enough to qualify as PRNG.
    :returns: a :py:class:`Bijection` object.
    """
    # Rotation tables, looked up by (bitness, counter_words).
    # Every table is written row by row and transposed afterwards.
    rotations_by_kind = {
        # These are the R_256 constants from the Threefish reference sources
        # with names changed to R_64x4...
        (64, 4): numpy.array([
            [14, 52, 23, 5, 25, 46, 58, 32],
            [16, 57, 40, 37, 33, 12, 22, 32],
        ]).T,

        # Output from skein_rot_search: (srs64_B64-X1000)
        # Random seed = 1. BlockSize = 128 bits. sampleCnt = 1024. rounds = 8, minHW_or=57
        # Start: Tue Mar 1 10:07:48 2011
        # rMin = 0.136. #0325[*15] [CRC=455A682F. hw_OR=64. cnt=16384. blkSize= 128].format
        (64, 2): numpy.array([
            [16, 42, 12, 31, 16, 32, 24, 21],
        ]).T,
        # 4 rounds: minHW = 4 [ 4 4 4 4 ]
        # 5 rounds: minHW = 8 [ 8 8 8 8 ]
        # 6 rounds: minHW = 16 [ 16 16 16 16 ]
        # 7 rounds: minHW = 32 [ 32 32 32 32 ]
        # 8 rounds: minHW = 64 [ 64 64 64 64 ]
        # 9 rounds: minHW = 64 [ 64 64 64 64 ]
        # 10 rounds: minHW = 64 [ 64 64 64 64 ]
        # 11 rounds: minHW = 64 [ 64 64 64 64 ]

        # Output from skein_rot_search: (srs-B128-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
        # Start: Mon Aug 24 22:41:36 2009
        # ...
        # rMin = 0.472. #0A4B[*33] [CRC=DD1ECE0F. hw_OR=31. cnt=16384. blkSize= 128].format
        (32, 4): numpy.array([
            [10, 11, 13, 23, 6, 17, 25, 18],
            [26, 21, 27, 5, 20, 11, 10, 20],
        ]).T,
        # 4 rounds: minHW = 3 [ 3 3 3 3 ]
        # 5 rounds: minHW = 7 [ 7 7 7 7 ]
        # 6 rounds: minHW = 12 [ 13 12 13 12 ]
        # 7 rounds: minHW = 22 [ 22 23 22 23 ]
        # 8 rounds: minHW = 31 [ 31 31 31 31 ]
        # 9 rounds: minHW = 32 [ 32 32 32 32 ]
        # 10 rounds: minHW = 32 [ 32 32 32 32 ]
        # 11 rounds: minHW = 32 [ 32 32 32 32 ]

        # Output from skein_rot_search (srs32x2-X5000.out)
        # Random seed = 1. BlockSize = 64 bits. sampleCnt = 1024. rounds = 8, minHW_or=28
        # Start: Tue Jul 12 11:11:33 2011
        # rMin = 0.334. #0206[*07] [CRC=1D9765C0. hw_OR=32. cnt=16384. blkSize= 64].format
        (32, 2): numpy.array([
            [13, 15, 26, 6, 17, 29, 16, 24],
        ]).T,
        # 4 rounds: minHW = 4 [ 4 4 4 4 ]
        # 5 rounds: minHW = 6 [ 6 8 6 8 ]
        # 6 rounds: minHW = 9 [ 9 12 9 12 ]
        # 7 rounds: minHW = 16 [ 16 24 16 24 ]
        # 8 rounds: minHW = 32 [ 32 32 32 32 ]
        # 9 rounds: minHW = 32 [ 32 32 32 32 ]
        # 10 rounds: minHW = 32 [ 32 32 32 32 ]
        # 11 rounds: minHW = 32 [ 32 32 32 32 ]
    }

    # Taken from Skein; looked up by bitness.
    parity_by_bitness = {
        64: numpy.uint64(0x1BD11BDAA9FC1A22),
        32: numpy.uint32(0x1BD11BDA),
    }

    assert 1 <= rounds <= 72

    word_type = numpy.uint32 if bitness == 32 else numpy.uint64
    word_dtype = dtypes.normalize_type(word_type)

    # The key has as many words as the counter.
    key_words = counter_words
    struct_types = create_struct_types(word_dtype, key_words, counter_words)
    key_dtype, key_ctype, counter_dtype, counter_ctype = struct_types

    template_def = TEMPLATE.get_def("threefry")
    render_kwds = dict(
        word_dtype=word_dtype,
        word_ctype=dtypes.ctype(word_dtype),
        key_words=key_words,
        counter_words=counter_words,
        key_ctype=key_ctype,
        counter_ctype=counter_ctype,
        rounds=rounds,
        rotation_constants=rotations_by_kind[(bitness, counter_words)],
        parity_constant=parity_by_bitness[bitness])
    module = Module(template_def, render_kwds=render_kwds)

    return Bijection(module, word_dtype, key_dtype, counter_dtype)
to_device = (to_device1, to_device2) from_device = (from_device1, from_device2, from_device3, from_device4, from_device5) for to_d, from_d in itertools.product(to_device, from_device): a_device = to_d(a) a_copy = thr.copy_array(a_device) a_back = from_d(a_copy) assert diff_is_negligible(a, a_back) @pytest.mark.parametrize( "dtype", TEST_DTYPES, ids=[dtypes.normalize_type(dtype).name for dtype in TEST_DTYPES]) def test_dtype_support(thr, dtype): # Test passes if either thread correctly reports that it does not support given dtype, # or it successfully compiles kernel that operates with this dtype. N = 256 if not thr.device_params.supports_dtype(dtype): pytest.skip() mul = functions.mul(dtype, dtype) div = functions.div(dtype, dtype) program = thr.compile(""" KERNEL void test( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b) {
thr.synchronize() return y to_device = (to_device1, to_device2) from_device = (from_device1, from_device2, from_device3, from_device4, from_device5) for to_d, from_d in itertools.product(to_device, from_device): a_device = to_d(a) a_copy = thr.copy_array(a_device) a_back = from_d(a_copy) assert diff_is_negligible(a, a_back) @pytest.mark.parametrize( "dtype", TEST_DTYPES, ids=[dtypes.normalize_type(dtype).name for dtype in TEST_DTYPES]) def test_dtype_support(thr, dtype): # Test passes if either thread correctly reports that it does not support given dtype, # or it successfully compiles kernel that operates with this dtype. N = 256 if not thr.device_params.supports_dtype(dtype): pytest.skip() mul = functions.mul(dtype, dtype) div = functions.div(dtype, dtype) program = thr.compile( """ KERNEL void test( GLOBAL_MEM ${ctype} *dest, GLOBAL_MEM ${ctype} *a, GLOBAL_MEM ${ctype} *b)