def codegen(context, builder, sig, args):
    """Copy the whole character buffer of one string array into another.

    Emits a memcpy of every character byte from the input string array's
    data pointer into the output string array's data pointer.  Returns a
    dummy value: the intrinsic has no meaningful result.
    """
    dst_arr, src_arr = args
    src = context.make_helper(builder, string_array_type, src_arr)
    dst = context.make_helper(builder, string_array_type, dst_arr)
    # Copy num_total_chars bytes of character data from source to dest.
    cgutils.memcpy(builder, dst.data, src.data, src.num_total_chars)
    return context.get_dummy_value()
def unicode_to_bytes_cast(context, builder, fromty, toty, val):
    """Lower a cast from unicode_type to bytes.

    Only strings whose representation kind is 1 (one byte per code
    point) can be converted; wider kinds raise ValueError at runtime.
    """
    uni = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
    byte_ptr = builder.bitcast(uni.data, ir.IntType(8).as_pointer())
    # Runtime guard: reject any string whose kind is not 1-byte.
    is_wide = builder.icmp_unsigned('!=', uni.kind,
                                    ir.Constant(uni.kind.type, 1))
    nbytes = uni.length
    with builder.if_then(is_wide):
        context.call_conv.return_user_exc(
            builder, ValueError,
            ("cannot cast higher than 8-bit unicode_type to bytes", ))
    # Allocate the bytes object and copy the raw character data across.
    result = _make_constant_bytes(context, builder, nbytes)
    cgutils.memcpy(builder, result.data, byte_ptr, result.nitems)
    return result
def codegen(context, builder, sig, args):
    """Copy the first ``ind`` strings of one string array into another.

    Copies ``ind + 1`` 32-bit offset entries (the extra entry is the
    terminating offset of string ``ind - 1``) and all character data up
    to the offset stored at position ``ind``.
    """
    dst_arr, src_arr, ind = args
    src = context.make_helper(builder, string_array_type, src_arr)
    dst = context.make_helper(builder, string_array_type, dst_arr)
    src_offsets = builder.bitcast(src.offsets, lir.IntType(32).as_pointer())
    dst_offsets = builder.bitcast(dst.offsets, lir.IntType(32).as_pointer())
    # ind + 1 offsets: one per copied string plus the closing offset.
    one = context.get_constant(types.intp, 1)
    n_offsets = builder.add(ind, one)
    cgutils.memcpy(builder, dst_offsets, src_offsets, n_offsets)
    # The amount of character data to copy is the offset at index ``ind``.
    n_chars = builder.load(builder.gep(src_offsets, [ind]))
    cgutils.memcpy(builder, dst.data, src.data, n_chars)
    return context.get_dummy_value()
def charseq_to_bytes(context, builder, fromty, toty, val):
    """Lower a cast from a fixed-width character sequence to bytes."""
    # The charseq's element count is known statically from its type.
    result = _make_constant_bytes(context, builder, val.type.count)
    # Spill the charseq value to a stack slot so its bytes are addressable.
    slot = cgutils.alloca_once_value(builder, value=val)
    raw = builder.bitcast(slot, result.data.type)
    cgutils.memcpy(builder, result.data, raw, result.nitems)
    return result
def build_gufunc_kernel(library, ctx, innerfunc, sig, inner_ndim):
    """Wrap the original CPU gufunc with a parallel dispatcher.

    Args
    ----
    library
        codegen library to link the generated wrapper against
    ctx
        numba's codegen context
    innerfunc
        llvm function of the original CPU gufunc (its runtime address)
    sig
        type signature of the gufunc
    inner_ndim
        inner dimension of the gufunc

    Details
    -------
    Generate a function of the following signature:

        void ufunc_kernel(char **args, npy_intp *dimensions,
                          npy_intp *steps, void *data)

    Divide the work equally across all threads and let the last thread
    take all the left over.

    Returns (pointer to the compiled wrapper, wrapper function name).
    """
    # Declare types and function
    byte_t = lc.Type.int(8)
    byte_ptr_t = lc.Type.pointer(byte_t)
    intp_t = ctx.get_value_type(types.intp)
    # Signature mirrors the NumPy gufunc kernel ABI:
    # (char **args, npy_intp *dimensions, npy_intp *steps, void *data)
    fnty = lc.Type.function(lc.Type.void(), [lc.Type.pointer(byte_ptr_t),
                                             lc.Type.pointer(intp_t),
                                             lc.Type.pointer(intp_t),
                                             byte_ptr_t])
    wrapperlib = ctx.codegen().create_library('parallelufuncwrapper')
    mod = wrapperlib.create_ir_module('parallel.gufunc.wrapper')
    # Name the kernel after the inner function's address to keep it unique.
    lfunc = mod.add_function(fnty, name=".kernel." + str(innerfunc))

    bb_entry = lfunc.append_basic_block('')

    # Function body starts
    builder = lc.Builder(bb_entry)

    args, dimensions, steps, data = lfunc.args

    # Release the GIL (and ensure we have the GIL)
    # Note: numpy ufunc may not always release the GIL; thus,
    # we need to ensure we have the GIL.
    pyapi = ctx.get_python_api(builder)
    gil_state = pyapi.gil_ensure()
    thread_state = pyapi.save_thread()

    # Distribute work: each thread gets total // NUM_THREADS iterations.
    total = builder.load(dimensions)
    ncpu = lc.Constant.int(total.type, NUM_THREADS)
    count = builder.udiv(total, ncpu)

    count_list = []
    remain = total

    for i in range(NUM_THREADS):
        # Per-thread copy of the dimensions array, with the outer-loop
        # count (slot 0) replaced by that thread's share of the work.
        space = cgutils.alloca_once(builder, intp_t, size=inner_ndim + 1)
        cgutils.memcpy(builder, space, dimensions,
                       count=lc.Constant.int(intp_t, inner_ndim + 1))
        count_list.append(space)

        if i == NUM_THREADS - 1:
            # Last thread takes all leftover
            builder.store(remain, space)
        else:
            builder.store(count, space)
            remain = builder.sub(remain, count)

    # Array count is input signature plus 1 (due to output array)
    array_count = len(sig.args) + 1

    # Get the increment step for each array
    steps_list = []
    for i in range(array_count):
        ptr = builder.gep(steps, [lc.Constant.int(lc.Type.int(), i)])
        step = builder.load(ptr)
        steps_list.append(step)

    # Get the array argument set for each thread
    args_list = []
    for i in range(NUM_THREADS):
        space = builder.alloca(byte_ptr_t,
                               size=lc.Constant.int(lc.Type.int(),
                                                    array_count))
        args_list.append(space)

        for j in range(array_count):
            # For each array, compute subarray pointer:
            # base + step[j] * count * thread_index
            dst = builder.gep(space, [lc.Constant.int(lc.Type.int(), j)])
            src = builder.gep(args, [lc.Constant.int(lc.Type.int(), j)])
            baseptr = builder.load(src)
            base = builder.ptrtoint(baseptr, intp_t)
            multiplier = lc.Constant.int(count.type, i)
            offset = builder.mul(steps_list[j], builder.mul(count,
                                                            multiplier))
            addr = builder.inttoptr(builder.add(base, offset),
                                    baseptr.type)
            builder.store(addr, dst)

    # Declare external functions (provided by numba's threading runtime)
    add_task_ty = lc.Type.function(lc.Type.void(), [byte_ptr_t] * 5)
    empty_fnty = lc.Type.function(lc.Type.void(), ())
    add_task = mod.get_or_insert_function(add_task_ty,
                                          name='numba_add_task')
    synchronize = mod.get_or_insert_function(empty_fnty,
                                             name='numba_synchronize')
    ready = mod.get_or_insert_function(empty_fnty, name='numba_ready')

    # Add tasks for queue; one per thread
    as_void_ptr = lambda arg: builder.bitcast(arg, byte_ptr_t)

    # Note: the runtime address is taken and used as a constant in the
    # function.
    fnptr = ctx.get_constant(types.uintp, innerfunc).inttoptr(byte_ptr_t)
    for each_args, each_dims in zip(args_list, count_list):
        innerargs = [as_void_ptr(x) for x
                     in [each_args, each_dims, steps, data]]
        builder.call(add_task, [fnptr] + innerargs)

    # Signal worker that we are ready
    builder.call(ready, ())
    # Wait for workers
    builder.call(synchronize, ())

    # Release the GIL
    pyapi.restore_thread(thread_state)
    pyapi.gil_release(gil_state)

    builder.ret_void()

    wrapperlib.add_ir_module(mod)
    wrapperlib.add_linking_library(library)
    return wrapperlib.get_pointer_to_function(lfunc.name), lfunc.name
# NOTE(review): this definition appears to duplicate an earlier, identical
# build_gufunc_kernel in this chunk — likely copies from two different
# modules concatenated together; confirm before deduplicating.
def build_gufunc_kernel(library, ctx, innerfunc, sig, inner_ndim):
    """Wrap the original CPU gufunc with a parallel dispatcher.

    Args
    ----
    library
        codegen library to link the generated wrapper against
    ctx
        numba's codegen context
    innerfunc
        llvm function of the original CPU gufunc (its runtime address)
    sig
        type signature of the gufunc
    inner_ndim
        inner dimension of the gufunc

    Details
    -------
    Generate a function of the following signature:

        void ufunc_kernel(char **args, npy_intp *dimensions,
                          npy_intp *steps, void *data)

    Divide the work equally across all threads and let the last thread
    take all the left over.

    Returns (pointer to the compiled wrapper, wrapper function name).
    """
    # Declare types and function
    byte_t = lc.Type.int(8)
    byte_ptr_t = lc.Type.pointer(byte_t)
    intp_t = ctx.get_value_type(types.intp)
    # Signature mirrors the NumPy gufunc kernel ABI:
    # (char **args, npy_intp *dimensions, npy_intp *steps, void *data)
    fnty = lc.Type.function(lc.Type.void(), [
        lc.Type.pointer(byte_ptr_t),
        lc.Type.pointer(intp_t),
        lc.Type.pointer(intp_t),
        byte_ptr_t
    ])
    wrapperlib = ctx.codegen().create_library('parallelufuncwrapper')
    mod = wrapperlib.create_ir_module('parallel.gufunc.wrapper')
    # Name the kernel after the inner function's address to keep it unique.
    lfunc = mod.add_function(fnty, name=".kernel." + str(innerfunc))

    bb_entry = lfunc.append_basic_block('')

    # Function body starts
    builder = lc.Builder(bb_entry)

    args, dimensions, steps, data = lfunc.args

    # Release the GIL (and ensure we have the GIL)
    # Note: numpy ufunc may not always release the GIL; thus,
    # we need to ensure we have the GIL.
    pyapi = ctx.get_python_api(builder)
    gil_state = pyapi.gil_ensure()
    thread_state = pyapi.save_thread()

    # Distribute work: each thread gets total // NUM_THREADS iterations.
    total = builder.load(dimensions)
    ncpu = lc.Constant.int(total.type, NUM_THREADS)
    count = builder.udiv(total, ncpu)

    count_list = []
    remain = total

    for i in range(NUM_THREADS):
        # Per-thread copy of the dimensions array, with the outer-loop
        # count (slot 0) replaced by that thread's share of the work.
        space = cgutils.alloca_once(builder, intp_t, size=inner_ndim + 1)
        cgutils.memcpy(builder, space, dimensions,
                       count=lc.Constant.int(intp_t, inner_ndim + 1))
        count_list.append(space)

        if i == NUM_THREADS - 1:
            # Last thread takes all leftover
            builder.store(remain, space)
        else:
            builder.store(count, space)
            remain = builder.sub(remain, count)

    # Array count is input signature plus 1 (due to output array)
    array_count = len(sig.args) + 1

    # Get the increment step for each array
    steps_list = []
    for i in range(array_count):
        ptr = builder.gep(steps, [lc.Constant.int(lc.Type.int(), i)])
        step = builder.load(ptr)
        steps_list.append(step)

    # Get the array argument set for each thread
    args_list = []
    for i in range(NUM_THREADS):
        space = builder.alloca(byte_ptr_t,
                               size=lc.Constant.int(lc.Type.int(),
                                                    array_count))
        args_list.append(space)

        for j in range(array_count):
            # For each array, compute subarray pointer:
            # base + step[j] * count * thread_index
            dst = builder.gep(space, [lc.Constant.int(lc.Type.int(), j)])
            src = builder.gep(args, [lc.Constant.int(lc.Type.int(), j)])
            baseptr = builder.load(src)
            base = builder.ptrtoint(baseptr, intp_t)
            multiplier = lc.Constant.int(count.type, i)
            offset = builder.mul(steps_list[j], builder.mul(count,
                                                            multiplier))
            addr = builder.inttoptr(builder.add(base, offset),
                                    baseptr.type)
            builder.store(addr, dst)

    # Declare external functions (provided by numba's threading runtime)
    add_task_ty = lc.Type.function(lc.Type.void(), [byte_ptr_t] * 5)
    empty_fnty = lc.Type.function(lc.Type.void(), ())
    add_task = mod.get_or_insert_function(add_task_ty,
                                          name='numba_add_task')
    synchronize = mod.get_or_insert_function(empty_fnty,
                                             name='numba_synchronize')
    ready = mod.get_or_insert_function(empty_fnty, name='numba_ready')

    # Add tasks for queue; one per thread
    as_void_ptr = lambda arg: builder.bitcast(arg, byte_ptr_t)

    # Note: the runtime address is taken and used as a constant in the
    # function.
    fnptr = ctx.get_constant(types.uintp, innerfunc).inttoptr(byte_ptr_t)
    for each_args, each_dims in zip(args_list, count_list):
        innerargs = [
            as_void_ptr(x) for x in [each_args, each_dims, steps, data]
        ]
        builder.call(add_task, [fnptr] + innerargs)

    # Signal worker that we are ready
    builder.call(ready, ())
    # Wait for workers
    builder.call(synchronize, ())

    # Release the GIL
    pyapi.restore_thread(thread_state)
    pyapi.gil_release(gil_state)

    builder.ret_void()

    wrapperlib.add_ir_module(mod)
    wrapperlib.add_linking_library(library)
    return wrapperlib.get_pointer_to_function(lfunc.name), lfunc.name