def __init__(self, mode='a', b_create_kernel_file: bool = True):
    self.mode = mode
    self.buff = zeros((10,), Types.short)
    self.data_t = self.buff.dtype
    func = Function('plus_one',
                    {'buffer': Global(self.buff.dtype),
                     'idx': Scalar(Types.int)},
                    ["""
                     return buffer[idx]+${some_integer};
                     """],
                    {'some_integer': 5},
                    returns=Types.float)
    knl = Kernel('some_operation',
                 {'buff': Global(self.buff),
                  'number': Scalar(Types.short(3.0))},
                 ["""
                  data_t factor = convert_${data_t}(1.3);
                  buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                  """],
                 replacements={'data_t': c_name_from_dtype(self.data_t)},
                 global_size=self.buff.shape)
    defines = {'SOME_CONSTANT': 6}
    type_defs = {'data_t': self.data_t}
    self.program = Program(defines=defines, type_defs=type_defs,
                           functions=[func], kernels=[knl]).compile()
    self.knl = knl
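
# A hedged host-side reference for the kernel above (my reading of the template, not
# an authoritative result): plus_one() adds the replacement ${some_integer}=5,
# convert_short(1.3) truncates to 1, and SOME_CONSTANT is defined as 6, so every
# element of the zero-initialized short buffer should end up as 0 + 5 + 6*1 = 11.
import numpy as np

factor = int(np.int16(1.3))                    # convert_short truncates toward zero -> 1
expected = np.full(10, 0 + 5 + 6 * factor, np.int16)
# after running the compiled kernel (e.g. self.knl()), self.buff.get() should equal `expected`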
def get_fft_stage_func(self):
    replacements = {'fft_radix_R': f'fft_radix_{self.radix}'}
    func_fft_iteration = Function(
        'fft_stage',
        {
            # thread index: T radix R fft ops are done in parallel
            't': Scalar(Types.int),
            # thread block index: identifies block of N/(R*T) blocks.
            'b': Scalar(Types.int),
            'Ns': Scalar(Types.int),
            'data0': Local(self.data_t) if self.b_local_memory else Global(self.data_t, read_only=True),
            'data1': Local(self.data_t) if self.b_local_memory else Global(self.data_t),
            'shared': Local(self.real_t),
            'direction': Scalar(Types.int),
            'iteration': Scalar(Types.int)
        },
        """
        int j = b*T + t; // as proposed in [1, p.3, text section A]
        private data_t v[R];
        real_t angle = -2*PI*(j % Ns)/(Ns*R);
        for(int r=0; r<R; r++){
            int idxS = j + r*N/R; // idxS = idxSource
            v_from_data0(v, data0, idxS, r, direction, iteration);
            real_t cos_angle = cos(r*angle);
            real_t sin_angle = sin(r*angle);
            v[r] = (data_t)(v[r].s0*cos_angle - v[r].s1*sin_angle,
                            v[r].s0*sin_angle + v[r].s1*cos_angle);
        }
        ${fft_radix_R}(v);
        if(Ns>=CW){ // todo: remove condition and store as separate kernel
            // changed line 27 of [1, Figure 2] to work with global dimensions of this class:
            int offset = expand(j, Ns, R);
            for(int r=0; r<R; r++){
                int idxD = offset + r*Ns; // idxD = idxDestination
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }else{
            // According to [1, p.4], such that global memory writes are coalesced
            // !! mistake in [1] where *Ns is missing: idxD = (int)(t/Ns)*R + (t%Ns); !!
            int idxD = (int)(t/Ns)*Ns*R + (t%Ns);
            exchange(v, idxD, Ns, t, T, shared);
            int offset = b*R*T + t;
            for(int r=0; r<R; r++){
                idxD = offset + r*T;
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }
        """,
        replacements=replacements)
    return func_fft_iteration
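
# The component-wise update of v[r] above is the usual FFT twiddle: it rotates the
# complex value (v[r].s0, v[r].s1) by r*angle, i.e. multiplies it by exp(i*r*angle)
# with angle = -2*pi*(j % Ns)/(Ns*R). A quick NumPy check of that identity (the
# concrete numbers are arbitrary placeholders):
import numpy as np

v = 0.3 + 0.7j                     # stands in for (v[r].s0, v[r].s1)
r_angle = -2 * np.pi * 3 / 16      # hypothetical value of r*angle
rotated = (v.real * np.cos(r_angle) - v.imag * np.sin(r_angle)) \
    + 1j * (v.real * np.sin(r_angle) + v.imag * np.cos(r_angle))
assert np.isclose(rotated, v * np.exp(1j * r_angle))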
def _get_kernel(stage_builder: FftStageBuilder, iteration, radix, data_in, data_out, emulate=False):
    if iteration == 0:  # data0 is in_buffer, whose length might not be a power of two
        offset_data_in = 'get_global_id(0)*N_INPUT'
    else:
        offset_data_in = 'get_global_id(0)*N'
    stage_builder.radix = radix
    funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages([radix])
    funcs.append(stage_builder.get_fft_stage_func())
    d['M'] = (M := data_in.shape[0])  # number of FFTs to perform simultaneously
    knl_gpu_fft = Kernel(
        'gpu_fft',
        {
            'Ns': Scalar(Types.int),
            # In each iteration, the algorithm can be thought of as combining the radix-R FFTs on
            # subsequences of length Ns into the FFT of a new sequence of length R*Ns by
            # performing an FFT of length R on the corresponding elements of the subsequences.
            'data_in': Global(data_in),
            'data_out': Global(data_out),
            'direction': Scalar(Types.int),
            'iteration': Scalar(Types.int)
        },
        """
        ${shared_mem_fft_stage}
        int t = get_local_id(2);   // thread index
        int b = get_global_id(1);  // thread block index
        fft_stage(t, b, Ns,
                  data_in + ${offset_data_in},
                  data_out + get_global_id(0)*N,
                  shared, direction, iteration);""",
        replacements={'offset_data_in': offset_data_in,
                      'shared_mem_fft_stage': shared_mem_fft_stage},
        global_size=(d['M'], max(1, int(d['N'] / (d['R'] * d['T']))), d['T']),
        local_size=(1, 1, d['T']))
    program = Program(
        funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
            context=data_in.context, emulate=emulate,
            file=Program.get_default_dir_pycl_kernels().joinpath(
                f'fft_{iteration}_{stage_builder.radix}'))
    return program.gpu_fft
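
# A hedged sketch of how these per-iteration kernels might be chained on the host
# (an assumption for illustration, not taken from this file): in a Stockham-style
# FFT, Ns starts at 1 and grows by the radix after every stage, and the two stage
# buffers are swapped so each stage reads the previous stage's output. The names
# `radixes`, `direction`, `data_in` and `data_out` are placeholders here.
Ns = 1
for iteration, radix in enumerate(radixes):
    gpu_fft = _get_kernel(stage_builder, iteration, radix, data_in, data_out)
    gpu_fft(Ns=Ns, data_in=data_in, data_out=data_out,
            direction=direction, iteration=iteration)
    Ns *= radix
    data_in, data_out = data_out, data_in  # ping-pong the stage buffers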
def _get_cl_program(self) -> Program:
    knl = Kernel(name='sum_along_axis',
                 args={'in_buffer': Global(self.in_buffer),
                       'axis': Scalar(Types.int(self.axis)),
                       'out_buffer': Global(self.out_buffer)},
                 body=["""
                       buff_t sum = (buff_t) 0;
                       for(int i=0; i<${size_input_axis}; i++){ // i == glob_id_axis
                           sum += in_buffer[${addr_in}];
                       }
                       out_buffer[${addr}] = sum;
                       """],
                 replacements={'size_input_axis': self.in_buffer.shape[self.axis],
                               'addr': Helpers.command_compute_address(self.out_buffer.ndim),
                               'addr_in': self._command_compute_address_in()},
                 global_size=self.out_buffer.shape)
    type_defs = {'buff_t': self.in_buffer.dtype}
    return Program(type_defs=type_defs, kernels=[knl])
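
# For reference, the kernel above performs the same reduction as NumPy's axis sum.
# A hedged, self-contained illustration of the expected result (shapes and values
# are arbitrary placeholders, not taken from this class):
import numpy as np

in_host = np.arange(2 * 3 * 4).reshape(2, 3, 4)
axis = 1
out_host = np.sum(in_host, axis=axis)   # what out_buffer should hold after the run
assert out_host.shape == (2, 4)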
def __init__(self, b_create_kernel_file: bool = True):
    self.buff = zeros((10,), Types.short)
    self.knl = Kernel('some_operation',
                      {'buff': Global(self.buff),
                       'number': Scalar(Types.short(1))},
                      ["""
                       buff[get_global_id(0)] = number;
                       """],
                      global_size=self.buff.shape).compile()
def test_two_input_integer_functions(name, dtype):
    a_cl = to_device(np.ones((10,), dtype))
    a_emulation = to_device(np.ones((10,), dtype))
    knl = Kernel(f'knl_{name}',
                 {'a': Global(a_cl),
                  'num': Scalar(dtype(0))},
                 f'a[get_global_id(0)]={name}(a[get_global_id(0)], num);',
                 global_size=a_cl.shape)
    knl.compile()(a=a_cl)
    knl.compile(emulate=True)(a=a_emulation)
    assert np.all(a_cl.get() == a_emulation.get())
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three', {'a': Scalar(Types.int)},
                        'return a + 3;', returns=Types.int)
    fnc_add5 = Function('add_five', {'a': Scalar(Types.int)},
                        """
                        return add_three(a)+2;
                        """,
                        functions=[fnc_add3], returns=Types.int)
    some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)},
                      """
                      ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
                      """,
                      global_size=ary_a.shape, functions=[fnc_add5])
    functions = []  # functions defined here have higher priority in case of name conflicts
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
def get_funcs_radix8(data_t):
    funcs_mulpxpy_8 = [Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
                       for k, v in mul_pxpy_dict8.items()]
    func_fft_radix_8 = Function(
        'fft_radix_8',
        {'v': Private(data_t)},
        """
        // 4x in-place DFT2
        data_t u0 = v[0];
        data_t u1 = v[1];
        data_t u2 = v[2];
        data_t u3 = v[3];
        data_t u4 = v[4];
        data_t u5 = v[5];
        data_t u6 = v[6];
        data_t u7 = v[7];

        data_t v0 = u0 + u4;
        data_t v4 = mul_p0q4(u0 - u4);
        data_t v1 = u1 + u5;
        data_t v5 = mul_p1q4(u1 - u5);
        data_t v2 = u2 + u6;
        data_t v6 = mul_p2q4(u2 - u6);
        data_t v3 = u3 + u7;
        data_t v7 = mul_p3q4(u3 - u7);

        // 4x in-place DFT2 and twiddle
        u0 = v0 + v2;
        u2 = mul_p0q2(v0 - v2);
        u1 = v1 + v3;
        u3 = mul_p1q2(v1 - v3);
        u4 = v4 + v6;
        u6 = mul_p0q2(v4 - v6);
        u5 = v5 + v7;
        u7 = mul_p1q2(v5 - v7);

        // 4x DFT2 and store (reverse binary permutation)
        v[0] = u0 + u1;
        v[1] = u4 + u5;
        v[2] = u2 + u3;
        v[3] = u6 + u7;
        v[4] = u0 - u1;
        v[5] = u4 - u5;
        v[6] = u2 - u3;
        v[7] = u6 - u7;
        """)
    return funcs_mulpxpy_8, func_fft_radix_8
def test_macro_with_arguments():
    # this is a macro with arguments
    defines = {'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'}
    ary = zeros((2,), Types.int)
    func_add_two = Function('add_two', {'a': Scalar(Types.int)},
                            'return a + 2;', returns=Types.int)
    knl = Kernel('knl_macro_func', {'ary': Global(ary)},
                 """
                 int a = 1;
                 int b = 2;
                 FUNC(a, b, add_two)
                 ary[get_global_id(0)] = a;
                 """,
                 defines=defines,
                 global_size=ary.shape)
    Program([func_add_two], [knl]).compile().knl_macro_func()
    assert np.allclose(ary.get(), np.array([4, 4]).astype(ary.dtype))
def get_funcs_radix4(data_t):
    # mul_pxqy(a) returns a*exp(-j * PI * p / q) where p=x and q=y
    funcs_mulpxpy_4 = [Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
                       for k, v in mul_pxpy_dict4.items()]
    func_fft_radix_4 = Function(
        'fft_radix_4',
        {'v': Private(data_t)},
        """
        // 2x DFT2 and twiddle
        data_t v0 = v[0] + v[2];
        data_t v1 = v[0] - v[2];
        data_t v2 = v[1] + v[3];
        data_t v3 = mul_p1q2(v[1] - v[3]); // twiddle

        // 2x DFT2 and store
        v[0] = v0 + v2;
        v[1] = v1 + v3;
        v[2] = v0 - v2;
        v[3] = v1 - v3;
        """)
    return funcs_mulpxpy_4, func_fft_radix_4
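
# A quick NumPy cross-check of the radix-4 butterfly above (a sketch mirroring the
# kernel body by hand): mul_p1q2(a) = a*exp(-j*pi/2) = -1j*a, so the stored outputs
# equal the length-4 DFT in natural order. The same kind of check applies to the
# radix-8 and radix-16 variants.
import numpy as np

def fft_radix_4_ref(x):
    v0 = x[0] + x[2]
    v1 = x[0] - x[2]
    v2 = x[1] + x[3]
    v3 = -1j * (x[1] - x[3])   # mul_p1q2 twiddle
    return np.array([v0 + v2, v1 + v3, v0 - v2, v1 - v3])

x = np.random.rand(4) + 1j * np.random.rand(4)
assert np.allclose(fft_radix_4_ref(x), np.fft.fft(x))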
def test_conversion_knl_fnc_args_with_no_pointer_format():
    a_np = np.array([0.1, 0.2], dtype=Types.float)
    b_cl = zeros(shape=(2,), dtype=Types.float)
    fnc = Function('copy_fnc',
                   {'a': a_np,
                    'b': b_cl,
                    'idx': Scalar(Types.int)},
                   """
                   b[idx] = a[idx];
                   """)
    knl = Kernel('some_knl',
                 {'a': a_np,
                  'b': b_cl},
                 """
                 copy_fnc(a, b, get_global_id(0));
                 """,
                 functions=[fnc], global_size=b_cl.shape)
    knl.compile()
    knl()
    assert np.all(a_np == b_cl.get())
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value):
    """
    Example usage:

    Set a slice of an array to a scalar value:
        val = 1
        cl_set(ary, Slice[:, 2:3], val)

    Set a slice of an array to an equally shaped numpy array:
        some_np_array = np.array([[3, 4]])
        cl_set(ary, Slice[1:2, 2:3], some_np_array)

    :param array: target cl array
    :param region: slice of the target array to be set
    :param value: scalar or numpy array matching the shape of the region
    :return:
    """
    # todo: test if array is c contiguous
    region_arg = region
    # If the slice is a contiguous block of memory it could be set directly, e.g.:
    # _buffer_np = np.zeros_like(add_symbols_memory_initialization.out_buffer)
    # _buffer_np[:, memory:-memory] = mapper.alphabet[0]
    # add_symbols_memory_initialization.out_buffer.set(_buffer_np)
    region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array)
    region = CopyArrayRegion._deal_with_none_in_stop(region, array)
    region = CopyArrayRegion._deal_with_negative_region(region, array)

    # test that the requested region lies within the array bounds
    for axis, _slice in enumerate(region):
        step_width = _slice[2]
        if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]:
            raise ValueError('Slicing out of array bounds')
    if any([(part[0] - part[1]) == 0 for part in region]):  # check that there is no empty slice
        return

    global_size = np.prod([part[1] - part[0] for part in region])
    target_shape = to_device(np.array(array.shape).astype(Types.int))
    offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int))
    source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int))
    source_n_dims = len(source_shape)

    if isinstance(value, np.ndarray):
        source = to_device(value.astype(array.dtype))
        arg_source = Global(source)
        code_source = 'source[get_global_id(0)]'
    else:
        arg_source = Scalar(array.dtype)
        source = value
        code_source = 'source'

    knl = Kernel('set_cl_array',
                 {'target': Global(array),
                  'target_shape': Global(target_shape),
                  'offset_target': Global(offset_target),
                  'source': arg_source,
                  'source_shape': Global(source_shape),
                  'source_n_dims': Scalar(Types.int)},
                 """
                 // id_source = get_global_id(0)
                 // id_source points to the element of array source which replaces the element with id_target in array target.
                 // We need to compute id_target from id_source.
                 // We assume c-contiguous addressing like:
                 // id_source = id0*s1*s2*s3 + id1*s2*s3 + id2*s3 + id3   (s refers to the shape of the source array)
                 // First, compute the individual ids of the source array from id_source:
                 // id3 = int(gid % s3),  temp = (gid-id3)/s3
                 // id2 = int(temp % s2), temp = (temp-id2)/s2
                 // id1 = int(temp % s1), temp = (temp-id1)/s1
                 // id0 = int(temp % s0), temp = (temp-id0)/s0
                 // Finally, determine the id of the target array and copy the element to the corresponding position:
                 // id_target = (id0+offset0t)*s1t*s2t ...   (sxt: shape of the target array along dim x)
                 int id_target = 0; // to be iteratively computed from global id, slice dimensions and ary dimensions
                 int temp = get_global_id(0);
                 int prod_source_id_multiplier = 1;
                 int prod_target_id_multiplier = 1;
                 for(int i=source_n_dims-1; i>=0; i--){ // i = i_axis_source
                     int id_source = temp % source_shape[i];
                     temp = (int)((temp-id_source)/source_shape[i]);
                     prod_source_id_multiplier *= source_shape[i];
                     id_target += (offset_target[i]+id_source)*prod_target_id_multiplier;
                     prod_target_id_multiplier *= target_shape[i];
                 }
                 target[id_target] = ${source};
                 """,
                 replacements={'addr': Helpers.command_compute_address(array.ndim),
                               'source': code_source},
                 global_size=(global_size,)
                 ).compile(array.context, emulate=False)
    knl(source=source, source_n_dims=source_n_dims)
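
# A hedged NumPy reference for the index arithmetic in the kernel above: the loop
# unravels the linear source id (C order), shifts it by the per-axis slice offsets
# and ravels it into the target shape. A small self-contained cross-check:
import numpy as np

def id_target_ref(gid, source_shape, target_shape, offset_target):
    src_idx = np.unravel_index(gid, source_shape)                  # per-axis source ids
    tgt_idx = tuple(o + i for o, i in zip(offset_target, src_idx))
    return np.ravel_multi_index(tgt_idx, target_shape)

# e.g. writing a (1, 2)-shaped source into target[1:2, 2:4] of a (3, 5) target:
assert id_target_ref(1, (1, 2), (3, 5), (1, 2)) == 1 * 5 + 3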
def get_funcs_radix16(data_t):
    func_mul1 = Function('mul_1',
                         {'a': Scalar(data_t),
                          'b': Scalar(data_t)},
                         'data_t x; x.even = MUL_RE(a,b); x.odd = MUL_IM(a,b); return x;',
                         returns=data_t,
                         defines={'MUL_RE(a,b)': '(a.even*b.even - a.odd*b.odd)',
                                  'MUL_IM(a,b)': '(a.even*b.odd + a.odd*b.even)'})
    funcs_mulpxpy_16 = [Function(k, {'a': Scalar(data_t)}, v, returns=data_t)
                        for k, v in mul_pxpy_dict16.items()]
    funcs_mulpxpy_16[0].defines = {'COS_8': np.cos(np.pi / 8),
                                   'SIN_8': np.sin(np.pi / 8)}
    func_fft_radix_16 = Function(
        'fft_radix_16',
        {'v': Private(data_t)},
        """
        data_t u[16];
        for (int m=0; m<16; m++) u[m] = v[m];

        // 8x in-place DFT2 and twiddle (1)
        DFT2_TWIDDLE(u[0],u[8],mul_p0q8);
        DFT2_TWIDDLE(u[1],u[9],mul_p1q8);
        DFT2_TWIDDLE(u[2],u[10],mul_p2q8);
        DFT2_TWIDDLE(u[3],u[11],mul_p3q8);
        DFT2_TWIDDLE(u[4],u[12],mul_p4q8);
        DFT2_TWIDDLE(u[5],u[13],mul_p5q8);
        DFT2_TWIDDLE(u[6],u[14],mul_p6q8);
        DFT2_TWIDDLE(u[7],u[15],mul_p7q8);

        // 8x in-place DFT2 and twiddle (2)
        DFT2_TWIDDLE(u[0],u[4],mul_p0q4);
        DFT2_TWIDDLE(u[1],u[5],mul_p1q4);
        DFT2_TWIDDLE(u[2],u[6],mul_p2q4);
        DFT2_TWIDDLE(u[3],u[7],mul_p3q4);
        DFT2_TWIDDLE(u[8],u[12],mul_p0q4);
        DFT2_TWIDDLE(u[9],u[13],mul_p1q4);
        DFT2_TWIDDLE(u[10],u[14],mul_p2q4);
        DFT2_TWIDDLE(u[11],u[15],mul_p3q4);

        // 8x in-place DFT2 and twiddle (3)
        DFT2_TWIDDLE(u[0],u[2],mul_p0q2);
        DFT2_TWIDDLE(u[1],u[3],mul_p1q2);
        DFT2_TWIDDLE(u[4],u[6],mul_p0q2);
        DFT2_TWIDDLE(u[5],u[7],mul_p1q2);
        DFT2_TWIDDLE(u[8],u[10],mul_p0q2);
        DFT2_TWIDDLE(u[9],u[11],mul_p1q2);
        DFT2_TWIDDLE(u[12],u[14],mul_p0q2);
        DFT2_TWIDDLE(u[13],u[15],mul_p1q2);

        // 8x DFT2 and store (reverse binary permutation)
        v[0]  = u[0]  + u[1];
        v[1]  = u[8]  + u[9];
        v[2]  = u[4]  + u[5];
        v[3]  = u[12] + u[13];
        v[4]  = u[2]  + u[3];
        v[5]  = u[10] + u[11];
        v[6]  = u[6]  + u[7];
        v[7]  = u[14] + u[15];
        v[8]  = u[0]  - u[1];
        v[9]  = u[8]  - u[9];
        v[10] = u[4]  - u[5];
        v[11] = u[12] - u[13];
        v[12] = u[2]  - u[3];
        v[13] = u[10] - u[11];
        v[14] = u[6]  - u[7];
        v[15] = u[14] - u[15];
        """,
        defines={'DFT2_TWIDDLE(a,b,t)': '{ data_t tmp = t(a-b); a += b; b = tmp; }'})
    funcs_helpers = [func_mul1] + funcs_mulpxpy_16
    return funcs_helpers, func_fft_radix_16
        # work group size:
        'T': (T := min(int(N / R), conf.global_mem_cacheline_size)),
        # coalescing width:
        'CW': (CW := conf.global_mem_cacheline_size),
        # in_buffer length, which might not be a power of two:
        'N_INPUT': conf.size_data_in_first_iteration,
        'ITERATION_MAX': self.iteration_max,
    }
    funcs_radix, defines_radices = get_funcs_radixes(radixes, data_t)
    defines = {**defines, **defines_radices}
    # [1]: The expand() function can be thought of as inserting a dimension of length N2 after the
    # first dimension of length N1 in a linearized index.
    func_expand = Function('expand',
                           {'idxL': Scalar(Types.int),
                            'N1': Scalar(Types.int),
                            'N2': Scalar(Types.int)},
                           """
                           return (int)(idxL/N1)*N1*N2 + (idxL%N1);
                           """,
                           returns=Types.int)
    # float2* v, int R, int idxD, int incD, int idxS, int incS
    func_exchange = Function('exchange',
                             {'v': Private(data_t),
                              'idxD': Scalar(Types.int),
                              'incD': Scalar(Types.int),
                              'idxS': Scalar(Types.int),
                              'incS': Scalar(Types.int),
                              'shared': Local(real_t),
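
# A small host-side check of the expand() function defined above (a sketch, using
# placeholder values): it inserts a dimension of length N2 after the first dimension
# of length N1 of a linearized index.
import numpy as np

def expand_ref(idxL, N1, N2):
    return (idxL // N1) * N1 * N2 + (idxL % N1)

# e.g. with N1=4, N2=3: index 6 of a length-8 array maps to position (1, 0, 2)
# of a (2, 3, 4)-shaped view, whose linear index is 1*12 + 0*4 + 2 = 14
assert expand_ref(6, 4, 3) == np.ravel_multi_index((1, 0, 2), (2, 3, 4)) == 14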