Пример #1
0
 def __init__(self, in_buffer: Array, axes_order: Tuple[int, ...]):
     self.axes_order = axes_order
     shape_out = tuple([list(in_buffer.shape)[i] for i in axes_order])
     self.out_buffer = empty(shape_out, in_buffer.dtype)
     self.in_buffer = in_buffer
     self.knl = Kernel(name='transpose',
                       args={
                           'in_buffer': Global(self.in_buffer,
                                               read_only=True),
                           'out_buffer': Global(self.out_buffer)
                       },
                       body=[
                           """
                             int i_in = ${i_in};
                             int i_out = ${i_out};
                             out_buffer[i_out] = in_buffer[i_in];                       
                            """
                       ],
                       replacements={
                           'i_in':
                           self._command_for_input_address_computation(),
                           'i_out':
                           self._command_for_output_address_computation()
                       },
                       global_size=self.in_buffer.shape).compile()
Пример #2
0
 def _get_cl_program(self) -> Program:
     knl = Kernel(name='sum_along_axis',
                  args={
                      'in_buffer': Global(self.in_buffer),
                      'axis': Scalar(Types.int(self.axis)),
                      'out_buffer': Global(self.out_buffer)
                  },
                  body=[
                      """
              buff_t sum = (buff_t) 0;
              for(int i=0; i<${size_input_axis}; i++){// i == glob_id_axis
                 sum+=in_buffer[${addr_in}];
              }                 
              out_buffer[${addr}] = sum;
              """
                  ],
                  replacements={
                      'size_input_axis':
                      self.in_buffer.shape[self.axis],
                      'addr':
                      Helpers.command_compute_address(self.out_buffer.ndim),
                      'addr_in':
                      self._command_compute_address_in()
                  },
                  global_size=self.out_buffer.shape)
     type_defs = {'buff_t': self.in_buffer.dtype}
     return Program(type_defs=type_defs, kernels=[knl])
Пример #3
0
def test_memoize_kernel():
    # thread = Thread(profile=True)
    ary_a = np.ones(int(1e3))
    ary_b = np.zeros(ary_a.shape)
    ary_a_buffer = to_device(ary_a)
    ary_b_buffer = to_device(ary_b)
    n_recompilations = 100
    for i in range(n_recompilations + 1):
        kernels = []
        for j in range(10):
            some_knl = Kernel(
                f'some_knl_{j}', {
                    'ary_a': Global(ary_a_buffer),
                    'ary_b': Global(ary_b_buffer)
                }, """
                     ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                     """)
            kernels.append(some_knl)
        Program(kernels=kernels).compile()
        some_knl(global_size=ary_a.shape)
        if i == 1:
            t = time.time()
    time_per_recompile = (time.time() - t) / n_recompilations
    # thread.queue.get_profiler().show_histogram_cumulative_kernel_times()
    print(time_per_recompile)
    assert time_per_recompile < 0.001  # less than 1 ms overhead per recompilation achieved through caching
Пример #4
0
    def get_fft_stage_func(self):
        replacements = {'fft_radix_R': f'fft_radix_{self.radix}'}
        func_fft_iteration = Function(
            'fft_stage',
            {
                # thread index: T radix R fft ops are done in parallel
                't':
                Scalar(Types.int),
                # thread block index: identifies block of N/(R*T) blocks.
                'b':
                Scalar(Types.int),
                'Ns':
                Scalar(Types.int),
                'data0':
                Local(self.data_t) if self.b_local_memory else Global(
                    self.data_t, read_only=True),
                'data1':
                Local(self.data_t)
                if self.b_local_memory else Global(self.data_t),
                'shared':
                Local(self.real_t),
                'direction':
                Scalar(Types.int),
                'iteration':
                Scalar(Types.int)
            },
            """
        int j = b*T + t; // as proposed in [1, p.3, text section A]
        private data_t v[R];        
        real_t angle = -2*PI*(j % Ns)/(Ns*R);
        for(int r=0; r<R; r++){
            int idxS = j + r* N/R; // idxS=idxSource
            v_from_data0(v, data0, idxS, r, direction,iteration);
            real_t cos_angle = cos(r*angle);
            real_t sin_angle = sin(r*angle);
            v[r] = (data_t)(v[r].s0*cos_angle-v[r].s1 *sin_angle,
                            v[r].s0*sin_angle+v[r].s1 *cos_angle);
        }
        ${fft_radix_R}(v);

        if(Ns>=CW){ // todo: remove condition and store as separate kernel
            // changed line 27 of [1, Figure 2] to work with global dimensions of this class:
            int offset = expand(j, Ns, R); 
            for(int r=0; r<R; r++){
                int idxD = offset + r*Ns; // idxD=idxDestination
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }
        }else{ // According to [1, p.4], such that global memory writes are coalesced
            // !! mistake in [1] where *Ns is missing: idxD = (int)(t/Ns)*R + (t%Ns); !!
            int idxD = (int)(t/Ns)*Ns*R + (t%Ns);
            exchange( v, idxD, Ns, t, T, shared);
            int offset = b*R*T+ t; 
            for( int r=0; r<R; r++ ){
                idxD = offset + r*T;
                data_1_from_v(v, data1, idxD, r, direction, iteration);
            }  
        }
                   """,
            replacements=replacements)
        return func_fft_iteration
Пример #5
0
 def __init__(self, mode='a', b_create_kernel_file: bool = True):
     self.mode = mode
     self.buff = zeros((10, ), Types.short)
     self.data_t = self.buff.dtype
     func = Function('plus_one', {
         'buffer': Global(self.buff.dtype),
         'idx': Scalar(Types.int)
     }, [
         """
                               return buffer[idx]+${some_integer};
                               """
     ], {'some_integer': 5},
                     returns=Types.float)
     knl = Kernel('some_operation', {
         'buff': Global(self.buff),
         'number': Scalar(Types.short(3.0))
     }, [
         """
                             data_t factor = convert_${data_t}(1.3);
                             buff[get_global_id(0)] = plus_one(buff, get_global_id(0)) + SOME_CONSTANT*factor;
                             """
     ],
                  replacements={'data_t': c_name_from_dtype(self.data_t)},
                  global_size=self.buff.shape)
     defines = {'SOME_CONSTANT': 6}
     type_defs = {'data_t': self.data_t}
     self.program = Program(defines=defines,
                            type_defs=type_defs,
                            functions=[func],
                            kernels=[knl]).compile()
     self.knl = knl
Пример #6
0
def test_get_refreshed_argument_of_memoized_kernel():
    for i in range(10):
        ary_a = np.ones(100) + i
        ary_b = np.zeros(100)
        some_knl = Kernel(
            'some_knl', {
                'ary_a': Global(to_device(ary_a)),
                'ary_b': Global(to_device(ary_b))
            }, """
                 ary_b[get_global_id(0)] = ary_a[get_global_id(0)];
                 """).compile()
        some_knl(global_size=ary_a.shape)
    assert np.all(some_knl.ary_b.get() == ary_a)
Пример #7
0
 def _get_kernel(stage_builder: FftStageBuilder,
                 iteration,
                 radix,
                 data_in,
                 data_out,
                 emulate=False):
     if iteration == 0:  # data0 is in_buffer, whose length might not be power of two
         offset_data_in = 'get_global_id(0)*N_INPUT'
     else:
         offset_data_in = 'get_global_id(0)*N'
     stage_builder.radix = radix
     funcs, type_defs, d, shared_mem_fft_stage = stage_builder.get_funcs_for_all_stages(
         [radix])
     funcs.append(stage_builder.get_fft_stage_func())
     d['M'] = (M :=
               data_in.shape[0])  # number of FFTs to perform simultaneously
     knl_gpu_fft = Kernel(
         'gpu_fft',
         {
             'Ns': Scalar(Types.int),
             # In each iteration, the algorithm can be thought of combining the radix R FFTs on
             # subsequences of length Ns into the FFT of a new sequence of length RNs by
             # performing an FFT of length R on the corresponding elements of the subsequences.
             'data_in': Global(data_in),
             'data_out': Global(data_out),
             'direction': Scalar(Types.int),
             'iteration': Scalar(Types.int)
         },
         """
              ${shared_mem_fft_stage}
              int t = get_local_id(2); // thread index 
              int b = get_global_id(1); // thread block index
              fft_stage(t, b, Ns, 
                        data_in  +${offset_data_in},
                        data_out +get_global_id(0)*N,
                        shared,direction, iteration);""",
         replacements={
             'offset_data_in': offset_data_in,
             'shared_mem_fft_stage': shared_mem_fft_stage
         },
         global_size=(d['M'], max(1,
                                  int(d['N'] / (d['R'] * d['T']))), d['T']),
         local_size=(1, 1, d['T']))
     program = Program(
         funcs, [knl_gpu_fft], defines=d, type_defs=type_defs).compile(
             context=data_in.context,
             emulate=emulate,
             file=Program.get_default_dir_pycl_kernels().joinpath(
                 f'fft_{iteration}_{stage_builder.radix}'))
     return program.gpu_fft
def test_debug_c_code_with_unary_increment_operation_inside_of_array():
    buff_cl = zeros((6, 1), Types.short)
    knl = Kernel('knl', {'buff': Global(buff_cl)},
                 """
        int number = -1;
        number++;
        buff[number++] = 1;
        buff[number] = 2;
        number = 0;
        buff[2+ number--] = 3;
        buff[3+ ++number] = 4;
        buff[5 + --number] = 5;
        int count = 0;
        for(int i=1; i<3; i++){
            count = count + i;
        }        
        buff[5] = count;
    """,
                 global_size=(1, ))
    compiled_cl = knl.compile(emulate=False)
    compiled_cl(buff=buff_cl)
    buff_py = zeros((6, 1), Types.short)
    compiled_py = knl.compile(
        emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_py(buff=buff_py)
    assert np.all(buff_py.get() == buff_cl.get())
def test_debug_kernel_with_barriers():
    buff = np.zeros(shape=(2, 4)).astype(Types.int)
    mem_buffer = to_device(buff)
    knl = Kernel('knl', {'mem_glob': Global(mem_buffer)},
                 """
        __local int mem[2];
        mem[0]=0;
        mem[1]=0;
        mem[get_local_id(1)] = get_local_id(1);
        barrier(CLK_LOCAL_MEM_FENCE);
        mem[get_local_id(1)] = mem[1];
        //barrier(CLK_GLOBAL_MEM_FENCE);
        mem_glob[get_global_id(0)*get_global_size(1)+get_global_id(1)] = mem[get_local_id(1)];
    """,
                 global_size=(2, 4),
                 local_size=(1, 2))
    compiled_cl = knl.compile(
        emulate=False,
        file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    mem_buffer_py = zeros_like(mem_buffer)
    compiled_py = knl.compile(
        emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    # out[0] = complex64(inp[0].real+out[0].imag*1j) instead of out[0].real=inp[0].real
    compiled_py(mem_glob=mem_buffer_py)
    assert np.all(mem_buffer.get() == mem_buffer_py.get())
 def __init__(self, in_buffer: Array, out_buffer_dtype: np.dtype):
     self.in_buffer = in_buffer
     self.out_buffer = empty(in_buffer.shape, out_buffer_dtype, in_buffer.queue)
     knl = Kernel(name='type',
                  args={'in_buffer': Global(self.in_buffer, read_only=True),
                        'out_buffer': Global(self.out_buffer)},
                  body=["""
                                int addr_in = ${command_addr_in};
                                int addr_out = ${command_addr_out};
                                out_buffer[addr_out]=convert_${buff_out_t}(in_buffer[addr_in]);
                                """],
                  replacements={'command_addr_in': Helpers.command_compute_address(self.in_buffer.ndim),
                                'command_addr_out': Helpers.command_compute_address(self.out_buffer.ndim),
                                'buff_out_t': c_name_from_dtype(self.out_buffer.dtype)},
                  global_size=self.in_buffer.shape)
     self.program = Program(kernels=[knl]).compile(context=in_buffer.context, emulate=False)
Пример #11
0
def test_multiple_command_queues():
    queue1 = create_queue(device_id=0)
    queue2 = create_queue(context=queue1.context)
    ary_a = to_device(np.ones(100000) + 1, queue1)
    ary_b = to_device(np.zeros(100000), queue1)
    some_knl = Kernel('some_knl', {
        'ary_a': Global(ary_a),
        'ary_b': Global(ary_b)
    },
                      """
             ary_b[get_global_id(0)] += ary_a[get_global_id(0)];
             """,
                      global_size=ary_a.shape).compile(queue2.context)
    some_knl(queue=queue2)
    # thread2.queue.finish()
    some_knl(queue=queue1)
    test = 0
Пример #12
0
 def __init__(self, b_create_kernel_file: bool = True):
     self.buff = zeros((10, ), Types.short)
     self.knl = Kernel('some_operation', {
         'buff': Global(self.buff),
         'number': Scalar(Types.short(1))
     }, [
         """
                             buff[get_global_id(0)] = number;
                             """
     ],
                       global_size=self.buff.shape).compile()
Пример #13
0
def test_two_input_integer_functions(name, dtype):
    a_cl = to_device(np.ones((10, ), dtype))
    a_emulation = to_device(np.ones((10, ), dtype))
    knl = Kernel(f'knl_{name}', {
        'a': Global(a_cl),
        'num': Scalar(dtype(0))
    },
                 f'a[get_global_id(0)]={name}(a[get_global_id(0)], num);',
                 global_size=a_cl.shape)
    knl.compile()(a=a_cl)
    knl.compile(emulate=True)(a=a_emulation)
    assert np.all(a_cl.get() == a_emulation.get())
Пример #14
0
def test_access_complex_variable():
    buff = np.array([0.5]).astype(Types.cfloat)
    buff_in = to_device(buff)
    buff_out = zeros_like(buff_in)
    knl = Kernel('knl', {
        'inp': Global(buff_in),
        'out': Global(buff_out)
    },
                 """
        out[get_global_id(0)].real = inp[get_global_id(0)].real; 
    """,
                 global_size=(1, ))
    compiled_cl = knl.compile(
        emulate=False,
        file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    compiled_cl()
    buff_out_py = zeros_like(buff_in)
    compiled_py = knl.compile(
        emulate=True, file=Path(__file__).parent.joinpath('py_cl_kernels/knl'))
    # out[0] = complex64(inp[0].real+out[0].imag*1j) instead of out[0].real=inp[0].real
    compiled_py(out=buff_out_py)
    assert np.all(buff_out.get() == buff_out_py.get())
Пример #15
0
def test_different_c_operations_at_once():
    ary = zeros((2, ), Types.int)
    knl = Kernel('knl_multiple_c_operations', {
        'ary': Global(ary)
    },
                 """int a = 1;
                    int b = 2;
                    dtype val; // test variable definition without assignment
                    dtype *ptr1; // test pointer definition 
                    global dtype *ptr2; // test global pointer definition 
                    ary[get_global_id(0)] = a>get_global_id(0) ? a : b;
                 """,
                 global_size=ary.shape,
                 type_defs={
                     'dtype': ary.dtype
                 }).compile(emulate=True)
    knl()
    assert np.allclose(ary.get(), np.array([1, 2]).astype(ary.dtype))
Пример #16
0
 def eval_code(emulate=False):
     data = to_device(np.array([0]).astype(Types.char))
     knl = Kernel('knl_test_for_loop', {
         'data': Global(data)
     },
                  """
         ${header}{
             data[0]+=i;
         }
     """,
                  replacements={
                      'header': header
                  },
                  global_size=data.shape).compile(emulate=emulate)
     knl()
     get_current_queue().finish()
     res = knl.data.get()
     return res
Пример #17
0
def test_macro_with_arguments():
    defines = {
        'FUNC(a,b,c)': '{ int tmp = c(a-b); a += b + tmp; }'
    }  # this is a macro with arguments
    ary = zeros((2, ), Types.int)
    func_add_two = Function('add_two', {'a': Scalar(Types.int)},
                            'return a + 2;',
                            returns=Types.int)
    knl = Kernel('knl_macro_func', {'ary': Global(ary)},
                 """
               int a = 1;
               int b = 2;
               FUNC(a, b, add_two)
               ary[get_global_id(0)] = a;
               """,
                 defines=defines,
                 global_size=ary.shape)
    Program([func_add_two], [knl]).compile().knl_macro_func()
    assert np.allclose(ary.get(), np.array([4, 4]).astype(ary.dtype))
Пример #18
0
def test_nested_local_barrier_inside_function():
    func_nested = Function('nested_func', {
        'ary': Global(Types.int),
        'shared': Local(Types.int)
    },
                           """
               shared[get_global_id(0)] = ary[get_global_id(0)];
               barrier(CLK_LOCAL_MEM_FENCE);
               return shared[(get_global_id(0)+1)%2] ;
               """,
                           returns=Types.int)
    func_parent = Function('parent',
                           func_nested.args,
                           """
               return nested_func(ary, shared);
               """,
                           returns=Types.int)

    ary = to_device((ary_np := np.array([1, 2]).astype(Types.int)))
Пример #19
0
def test_add_functions_inside_function_or_kernel_definition():
    ary_a = to_device(np.ones(100))
    fnc_add3 = Function('add_three', {'a': Scalar(Types.int)},
                        'return a + 3;',
                        returns=Types.int)
    fnc_add5 = Function('add_five', {'a': Scalar(Types.int)},
                        """
             return add_three(a)+2;
             """,
                        functions=[fnc_add3],
                        returns=Types.int)
    some_knl = Kernel('some_knl', {'ary_a': Global(ary_a)},
                      """
             ary_a[get_global_id(0)] = add_five(ary_a[get_global_id(0)]);
             """,
                      global_size=ary_a.shape,
                      functions=[fnc_add5])
    functions = [
    ]  # funcitons defined here have higher proiority in case of name conflicts
    Program(functions=functions, kernels=[some_knl]).compile()
    some_knl()
    assert ary_a.get()[0] == 6
Пример #20
0
 def run(emulate=False):
     ary = to_device(np.ones(10).astype(data_t))
     local_mem = LocalArray(
         dtype=data_t,
         shape=5)  # 5 is to to test that local array argument is changed
     knl = Kernel('knl_local_arg', {
         'ary': Global(ary),
         'local_mem': local_mem
     },
                  """
            int offset = get_group_id(0)*get_local_size(0);
            for(int i=0; i<5; i++) local_mem[i] = ary[offset + i];
            barrier(CLK_LOCAL_MEM_FENCE);
            data_t sum = (data_t)(0);
            for(int i=0; i<5; i++) sum+=local_mem[i];
            ary[get_global_id(0)] = sum;
                  """,
                  type_defs={'data_t': data_t},
                  global_size=ary.shape,
                  local_size=(5, ))
     local_mem = LocalArray(dtype=data_t, shape=5)
     knl.compile(emulate=emulate)(local_mem=local_mem)
     return ary.get()
    def __init__(self, in_buffer: Array,
                 region_in: TypeSliceFormatCopyArrayRegion = None,
                 out_buffer: Array = None,
                 region_out: TypeSliceFormatCopyArrayRegion = None):
        """

        :param in_buffer:
        :param region_in: e.g. region_in=((0,0,1,2),(1,1,3,2)) selects the region_in of array a, like numpy would do with
        a[0:1:2,1:3:2] where 2 is the step width. The first element of tuple selects the axis.
        :param out_buffer: target buffer where data from in_buffer is being copied. (optional)
        :param region_out: specifies the region of out buffer memory where in_buffer data is copied (optional)
        """
        _region_in_original = copy(region_in)  # for debug purposes
        _region_out_original = copy(region_out)  # for debug purposes

        if region_in is not None:
            region_in = self._deal_with_incomplete_regions(region_in, in_buffer)

        if out_buffer is not None and region_out is not None:
            region_out = self._deal_with_incomplete_regions(region_out, out_buffer)

        if region_in is not None:
            region_in = self._deal_with_none_in_stop(region_in, in_buffer)
        if region_out is not None and out_buffer is not None:
            region_out = self._deal_with_none_in_stop(region_out, out_buffer)

        if region_in is not None:
            region_in = self._deal_with_negative_region(region_in, in_buffer)
        if region_out is not None and out_buffer is not None:
            region_out = self._deal_with_negative_region(region_out, out_buffer)

        self.in_buffer = in_buffer
        if region_in is None:
            self.region_in = [(0, self.in_buffer.shape[i_axis], 1) for i_axis in range(self.in_buffer.ndim)]
        else:
            self.region_in = region_in

        if out_buffer is None and region_out is None:
            shape = [ax[1] - ax[0] for ax in self.region_in]
            self.out_buffer = empty(tuple(shape), dtype=self.in_buffer.dtype)
            self.region_out = [(0, i, 1) for i in shape]  # (tuple([0]*len(shape)),shape)
        elif out_buffer is not None and region_out is not None:
            self.out_buffer = out_buffer
            self.region_out = region_out
        else:
            raise ValueError('Case of input argument combination not supported')

        if self.in_buffer.dtype != self.out_buffer.dtype:
            raise ValueError('in and out buffer must be of same type')

        self.shape_region_out = tuple([ax[1] - ax[0] for ax in self.region_out])
        self.in_buffer = in_buffer

        self.copy_array_region = Kernel(name='copy_array_region',
                                        args={'in_buffer': Global(self.in_buffer, read_only=True),
                                              'out_buffer': Global(self.out_buffer, )},
                                        body=["""
                                  int addr_in = ${command_addr_in};
                                  int addr_out = ${command_addr_out};
                                  out_buffer[addr_out]=in_buffer[addr_in];
                                  """],
                                        replacements={'command_addr_in': self._command_for_addr_in_computation(),
                                                      'command_addr_out': self._command_for_addr_out_computation()},
                                        global_size=self.shape_region_out).compile()
def cl_set(array: Array, region: TypeSliceFormatCopyArrayRegion, value):
    """
    example usage:
    set slice of array with scalar value
    val = 1
    cl_set(ary, Slice[:,2:3], val)

    set slice of array with equally shaped numpy array like the slice
    some_np_array = np.array([[3,4])
    cl_set(ary, Slice[1:2,2:3], some_np_array)


    :param array:
    :param region:
    :param value:
    :return:
    """
    # todo test if array c contiguous
    region_arg = region
    # if slice is contiguous block of memory set it as
    # _buffer_np = np.zeros_like(add_symbols_memory_initialization.out_buffer)
    # _buffer_np[:, memory: -memory] = mapper.alphabet[0]
    # add_symbols_memory_initialization.out_buffer.set(_buffer_np)
    region = CopyArrayRegion._deal_with_incomplete_regions(region_arg, array)
    region = CopyArrayRegion._deal_with_none_in_stop(region, array)
    region = CopyArrayRegion._deal_with_negative_region(region, array)

    # test if requested region is
    for axis, _slice in enumerate(region):
        step_width = _slice[2]
        if abs(_slice[0] * step_width) > array.shape[axis] or abs(_slice[1] * step_width) > array.shape[axis]:
            raise ValueError('Slicing out of array bounds')
    if any([(part[0] - part[1]) == 0 for part in region]):  # check that there is no empty slice
        return
    global_size = np.product([part[1] - part[0] for part in region])
    target_shape = to_device(np.array(array.shape).astype(Types.int))
    offset_target = to_device(np.array([part[0] for part in region]).astype(Types.int))
    source_shape = to_device(np.array([part[1] - part[0] for part in region]).astype(Types.int))
    source_n_dims = len(source_shape)

    if isinstance(value, np.ndarray):
        source = to_device(value.astype(array.dtype))
        arg_source = Global(source)
        code_source = 'source[get_global_id(0)]'
    else:
        arg_source = Scalar(array.dtype)
        source = value
        code_source = 'source'

    knl = Kernel('set_cl_array',
                 {'target': Global(array),
                  'target_shape': Global(target_shape),
                  'offset_target': Global(offset_target),
                  'source': arg_source,
                  'source_shape': Global(source_shape),
                  'source_n_dims': Scalar(Types.int)},
                 """
   // id_source = get_global_id(0)
   // id_source points to element of array source which replaces element with id_target in array target.
   // we need to compute id_target from id_source:
   // we assume c-contiguous addressing like:
   // id_source = id0*s1*s2*s3+id1*s2*s3+id2*s3+id3 (here s refers shape of source array)
   // At first we need to compute individual ids of source array from id_source:
   // id3 = int(gid % s3), temp = (gid-id3)/s3
   // id2 = int(temp % s2), temp = (temp-id2)/s2
   // id1 = int(temp % s1), temp = (temp-id1)/s1
   // id0 = int(temp % s0), temp = (temp-id0)/s1
   // Finally, we can determine the id of the target array and copy element to corresponding position:
   // id_target = (id0*offset0t)*s1t*s2t ... (sxt: shape of target array along dim x)
   int id_target = 0; // to be iteratively computed from global id, slice dimensions and ary dimensions

   int temp = get_global_id(0);
   int prod_source_id_multiplier = 1;
   int prod_target_id_multiplier = 1;

   for(int i=source_n_dims-1; i>=0; i--){ // i=i_axis_source
    int id_source = temp % source_shape[i];
    temp = (int)((temp-id_source)/source_shape[i]);
    prod_source_id_multiplier *= source_shape[i];
    id_target += (offset_target[i]+id_source)*prod_target_id_multiplier;
    prod_target_id_multiplier *= target_shape[i];
   }
   target[id_target] = ${source};
                   """,
                 replacements={'addr': Helpers.command_compute_address(array.ndim),
                               'source': code_source},
                 global_size=(global_size,)
                 ).compile(array.context, emulate=False)
    knl(source=source, source_n_dims=source_n_dims)
Пример #23
0
               shared[get_global_id(0)] = ary[get_global_id(0)];
               barrier(CLK_LOCAL_MEM_FENCE);
               return shared[(get_global_id(0)+1)%2] ;
               """,
                           returns=Types.int)
    func_parent = Function('parent',
                           func_nested.args,
                           """
               return nested_func(ary, shared);
               """,
                           returns=Types.int)

    ary = to_device((ary_np := np.array([1, 2]).astype(Types.int)))
    use_existing_file_for_emulation(False)
    knl = Kernel('some_knl', {
        'ary': Global(ary),
    },
                 """
                __local int shared[2];
                ary[get_global_id(0)] = parent(ary, shared);
                   """,
                 global_size=ary.shape)
    prog = Program([func_nested, func_parent], [knl])
    prog_py = prog.compile(emulate=True)
    prog_cl = prog.compile(emulate=False)
    prog_py.some_knl()

    ary_py = ary.get()
    ary.set(ary_np)
    prog_cl.some_knl()
    ary_cl = ary.get()
Пример #24
0
                             sr[i] = v[r].s0;
                             si[i] = v[r].s1;
                         }
                         barrier(CLK_LOCAL_MEM_FENCE);
                         for(int r=0; r<R; r++ ) {
                             int i = (idxS + r*incS)*STRIDE;
                             v[r] = (data_t)(sr[i], si[i]);
                         }
                          """,
                          defines={'STRIDE': 1})
 func_v_from_data0 = Function(
     'v_from_data0', {
         'v':
         Private(self.data_t),
         'data0':
         Local(self.data_t) if self.b_local_memory else Global(
             self.data_t, read_only=True),
         'idxS':
         Scalar(Types.int),
         'r':
         Scalar(Types.int),
         'direction':
         Scalar(Types.int),
         'iteration':
         Scalar(Types.int)
     }, """
          if(iteration ==0){
              if(idxS<N_INPUT){
                 v[r]=data0[(int)(idxS)];
              }else{
                 v[r]=(data_t)(0.0, 0.0);
              }