class VectorArithDriver2(CDefinition):
    # Driver that feeds VectorArith with 4-float vectors assembled lane by
    # lane from the input arrays A, B, C; results are written to D.
    _name_ = 'vector_arith_driver_2'
    _argtys_ = [
        ('A', C.pointer(C.float)),
        ('B', C.pointer(C.float)),
        ('C', C.pointer(C.float)),
        ('D', C.pointer(C.float)),
        ('n', C.int),
    ]

    def body(self, Aary, Bary, Cary, Dary, n):
        '''
        This version loads element of vector individually.
        This style generates scalar ld/st instead of vector ld/st.
        '''
        vecarith = self.depends(VectorArith())
        a = self.var(floatv4)
        b = self.var(floatv4)
        c = self.var(floatv4)
        # Outer loop advances by the vector width (floatv4.count lanes).
        elem_per_vec = self.constant(C.int, floatv4.count)
        with self.for_range(0, n, elem_per_vec) as (outer, i):
            # Inner loop fills each lane with an individual scalar load.
            with self.for_range(elem_per_vec) as (inner, j):
                a[j] = Aary[i + j]
                b[j] = Bary[i + j]
                c[j] = Cary[i + j]
            r = vecarith(a, b, c)
            # NOTE(review): despite the scalar loads above, the result is
            # written back with an unaligned *vector* store — confirm this
            # asymmetry is intentional.
            Dary[i:].vector_store(r, align=1)
            # self.debug(r[0], r[1], r[2], r[3])
        self.ret()
def body(self, args, dimensions, steps, data):
    """Emit the scalar ufunc outer loop around the wrapped kernel.

    args/steps hold one data pointer and one byte-step per input plus one
    for the output; dimensions[0] is the element count.
    """
    ufunc_ptr = self.depends(self.FuncDef)
    fnty = ufunc_ptr.type.pointee
    arg_ptrs = []
    arg_steps = []
    # One pointer/step per kernel argument, plus one for the return slot.
    for i in range(len(fnty.args) + 1):
        arg_ptrs.append(self.var_copy(args[i]))
        const_steps = self.var_copy(steps[i])
        # Steps never change inside the loop; mark invariant for LLVM.
        const_steps.invariant = True
        arg_steps.append(const_steps)
    with self.for_range(dimensions[0]) as (loop, item):
        callargs = []
        for i, argty in enumerate(fnty.args):
            casted = arg_ptrs[i].cast(C.pointer(argty))
            callargs.append(casted.load())
            arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])  # increment pointer
        # Pass inline=True directly instead of the obfuscated
        # **dict(inline=True) form.
        res = ufunc_ptr(*callargs, inline=True)
        retval_ptr = arg_ptrs[-1].cast(C.pointer(fnty.return_type))
        retval_ptr.store(res, nontemporal=True)
        arg_ptrs[-1].assign(arg_ptrs[-1][arg_steps[-1]:])
    self.ret()
class VectorArithDriver1(CDefinition):
    # Driver that feeds VectorArith with whole 4-float vectors loaded
    # directly from the input arrays A, B, C; results are written to D.
    _name_ = 'vector_arith_driver_1'
    _argtys_ = [
        ('A', C.pointer(C.float)),
        ('B', C.pointer(C.float)),
        ('C', C.pointer(C.float)),
        ('D', C.pointer(C.float)),
        ('n', C.int),
    ]

    def body(self, Aary, Bary, Cary, Dary, n):
        '''
        This version uses vector load to fetch array elements as vectors.
        '''
        vecarith = self.depends(VectorArith())
        # Advance by the vector width each iteration.
        elem_per_vec = self.constant(C.int, floatv4.count)
        with self.for_range(0, n, elem_per_vec) as (loop, i):
            # Aary[i:] offset the array at i
            a = Aary[i:].vector_load(4, align=1)  # unaligned vector load
            b = Bary[i:].vector_load(4, align=1)
            c = Cary[i:].vector_load(4, align=1)
            r = vecarith(a, b, c)
            Dary[i:].vector_store(r, align=1)
            # self.debug(r[0], r[1], r[2], r[3])
        self.ret()
def _generate_argtys(self):
    """Assemble the C-level argument type list for the generated wrapper.

    Inputs are passed as no-alias pointers; the output is either a single
    'out' pointer (value-returning kernels) or the declared out_args, and
    an 'n_elements' count is always appended last.
    """
    argtys = [(arg.name, C.pointer(arg.type), [ATTR_NO_ALIAS])
              for arg in self.in_args]
    if self.returns_value:
        argtys.append(('out', C.pointer(self.return_type), [ATTR_NO_ALIAS]))
    else:
        argtys.extend((arg.name, arg.type, [ATTR_NO_ALIAS])
                      for arg in self.out_args)
    argtys.append(('n_elements', C.py_ssize_t))
    return argtys
class NewAxis(NumbaCDefinition):
    """Insert a broadcasting axis at dst_dim: extent one, stride zero."""

    _name_ = "newaxis"
    _argtys_ = [
        ('out_shape', C.pointer(C.npy_intp)),
        ('out_strides', C.pointer(C.npy_intp)),
        ('dst_dim', C.int),
    ]

    def body(self, out_shape, out_strides, dst_dim):
        one, zero = get_constants(self)
        # A newaxis dimension broadcasts: stride 0 means every index maps
        # to the same data, and its extent is 1.
        out_strides[dst_dim] = zero
        out_shape[dst_dim] = one
        # self.debug("newaxis in dimension:", dst_dim)
        self.ret()
def fakeit(self, dtype, data, dimensions, steps):
    """Populate this struct so it can masquerade as a minimal ndarray.

    Fills refcount/type/descr/flags plus data, nd, dimensions and strides
    from the given LLVM values. The object is never handed to Python's GC
    (refcount is pinned at 1 and type/descr are raw id() addresses).
    """
    assert len(dimensions) == len(steps)
    constant = self.parent.constant

    # Fake a permanently-live object: refcount 1, ob_type = id(np.ndarray).
    self.ob_refcnt.assign(constant(C.intp, 1))
    type_p = constant(C.py_ssize_t, id(np.ndarray))
    self.ob_type.assign(type_p.cast(C.pointer(C.int)))

    self.base.assign(self.parent.constant_null(C.void_p))
    # The dtype object is likewise smuggled in by address.
    dtype_p = constant(C.py_ssize_t, id(dtype))
    self.descr.assign(dtype_p.cast(C.void_p))
    self.flags.assign(constant(C.int, _internal.NPY_WRITEABLE))

    self.data.assign(data)
    self.nd.assign(constant(C.int, len(dimensions)))

    # Single allocation holds both dims (first half) and strides (second
    # half); ary_steps aliases into the same buffer.
    ary_dims = self.parent.array(C.intp, len(dimensions) * 2)
    ary_steps = ary_dims[len(dimensions):]
    for i, dim in enumerate(dimensions):
        ary_dims[i] = dim
    self.dimensions.assign(ary_dims)

    # ary_steps = self.parent.array(C.intp, len(steps))
    for i, step in enumerate(steps):
        ary_steps[i] = step
    self.strides.assign(ary_steps)
class WinThreadAPI(CExternal):
    '''external declaration of the Windows thread API
    '''
    _calling_convention_ = CC_X86_STDCALL

    # HANDLE is an opaque pointer.
    handle_t = C.void_p

    # lpStartAddress is an LPTHREAD_START_ROUTINE, with the form
    # DWORD ThreadProc (LPVOID lpdwThreadParam )
    CreateThread = Type.function(
        handle_t,
        [C.void_p,             # lpThreadAttributes (NULL for default)
         C.intp,               # dwStackSize (0 for default)
         C.void_p,             # lpStartAddress
         C.void_p,             # lpParameter
         C.int32,              # dwCreationFlags (0 for default)
         C.pointer(C.int32)])  # lpThreadId (NULL if not required)

    # Return is WAIT_OBJECT_0 (0x00000000) to indicate the thread exited,
    # or WAIT_ABANDONED, WAIT_TIMEOUT, WAIT_FAILED for other conditions.
    WaitForSingleObject = Type.function(
        C.int32,
        [handle_t,   # hHandle
         C.int32])   # dwMilliseconds (INFINITE == 0xFFFFFFFF means wait forever)

    CloseHandle = Type.function(C.int32, [handle_t])
def gen_test_worker(mod):
    """Emit ``void worker(int *p)`` that performs ``*p += 1`` (no atomics)."""
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    slot = builder.args[0]
    current = slot.load()
    increment = builder.constant(current.type, 1)
    slot.store(current + increment)
    builder.ret()
    builder.close()
def body(self, args, dimensions, steps, data):
    """Emit the ufunc outer loop, supporting return-by-reference kernels.

    A void kernel return type means the result is written through the
    kernel's last (pointer) argument; otherwise the kernel returns the
    value directly. Pointer-typed kernel arguments receive the element
    address, scalar ones a loaded value.
    """
    ufunc_ptr = self.depends(self.FuncDef)
    fnty = ufunc_ptr.type.pointee

    # void return type implies return by reference in last argument
    is_void_ret = fnty.return_type.kind == lc.TYPE_VOID
    argtys = fnty.args[:-1] if is_void_ret else fnty.args

    arg_ptrs = []
    arg_steps = []
    # One pointer/step per real argument, plus one for the output.
    for i in range(len(argtys) + 1):
        arg_ptrs.append(self.var_copy(args[i]))
        const_steps = self.var_copy(steps[i])
        const_steps.invariant = True  # steps are loop-invariant
        arg_steps.append(const_steps)

    with self.for_range(dimensions[0]) as (loop, item):
        callargs = []
        for i, argty in enumerate(argtys):
            if argty.kind == lc.TYPE_POINTER:
                # Pointer argument: hand the kernel the element address.
                casted = arg_ptrs[i].cast(argty)
                callargs.append(casted)
            else:
                casted = arg_ptrs[i].cast(C.pointer(argty))
                callargs.append(casted.load())
            # increment pointer
            arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])

        if is_void_ret:
            retty = fnty.args[-1]
            assert retty.kind == lc.TYPE_POINTER
            # Stack slot the kernel writes its result into.
            retval = self.var(retty.pointee)
            callargs.append(retval.ref)
            # (removed leftover debug print of callargs)
            ufunc_ptr(*callargs, inline=True)
            res = retval
            retval_ptr = arg_ptrs[-1].cast(retty)
        else:
            res = ufunc_ptr(*callargs, inline=True)
            retval_ptr = arg_ptrs[-1].cast(C.pointer(fnty.return_type))

        retval_ptr.store(res, nontemporal=True)
        arg_ptrs[-1].assign(arg_ptrs[-1][arg_steps[-1]:])

    self.ret()
class IndexAxis(NumbaCDefinition):
    """Return ``data`` advanced by ``index`` elements along ``src_dim``."""

    _name_ = "index"
    _retty_ = C.char_p
    _argtys_ = [
        ('data', C.char_p),
        ('in_shape', C.pointer(C.npy_intp)),
        ('in_strides', C.pointer(C.npy_intp)),
        ('src_dim', C.npy_intp),
        ('index', C.npy_intp),
    ]

    def body(self, data, in_shape, in_strides, src_dim, index):
        # self.debug("indexing...", src_dim, "stride", in_strides[src_dim])
        # Byte offset is stride-of-axis times the index.
        offset = in_strides[src_dim] * index
        result = self.var(data.type, name='result')
        result.assign(data[offset:])
        self.ret(result)
def body(self, args, dimensions, steps, data):
    """Emit the ufunc outer loop, supporting return-by-reference kernels.

    A void kernel return type means the result is written through the
    kernel's last (pointer) argument; otherwise the kernel returns the
    value directly. Pointer-typed kernel arguments receive the element
    address, scalar ones a loaded value.
    """
    ufunc_ptr = self.depends(self.FuncDef)
    fnty = ufunc_ptr.type.pointee

    # void return type implies return by reference in last argument
    is_void_ret = fnty.return_type.kind == lc.TYPE_VOID
    argtys = fnty.args[:-1] if is_void_ret else fnty.args

    arg_ptrs = []
    arg_steps = []
    # One pointer/step per real argument, plus one for the output.
    for i in range(len(argtys) + 1):
        arg_ptrs.append(self.var_copy(args[i]))
        const_steps = self.var_copy(steps[i])
        const_steps.invariant = True  # steps are loop-invariant
        arg_steps.append(const_steps)

    with self.for_range(dimensions[0]) as (loop, item):
        callargs = []
        for i, argty in enumerate(argtys):
            if argty.kind == lc.TYPE_POINTER:
                # Pointer argument: hand the kernel the element address.
                casted = arg_ptrs[i].cast(argty)
                callargs.append(casted)
            else:
                casted = arg_ptrs[i].cast(C.pointer(argty))
                callargs.append(casted.load())
            # increment pointer
            arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])

        if is_void_ret:
            retty = fnty.args[-1]
            assert retty.kind == lc.TYPE_POINTER
            # Stack slot the kernel writes its result into.
            retval = self.var(retty.pointee)
            callargs.append(retval.ref)
            # (removed leftover debug print of callargs)
            ufunc_ptr(*callargs, inline=True)
            res = retval
            retval_ptr = arg_ptrs[-1].cast(retty)
        else:
            res = ufunc_ptr(*callargs, inline=True)
            retval_ptr = arg_ptrs[-1].cast(C.pointer(fnty.return_type))

        retval_ptr.store(res, nontemporal=True)
        arg_ptrs[-1].assign(arg_ptrs[-1][arg_steps[-1]:])

    self.ret()
def _outer_loop(self, dargs, dimensions, pyarys, steps, data):
    """Emit the gufunc outer loop over dimensions[0].

    For each iteration the inner function is called with either a scalar
    (loaded through the array's data pointer) or a PyArray reference,
    depending on the inner function's declared argument type; afterwards
    every fake array's data pointer is advanced by its outer step.
    """
    # implement outer loop
    innerfunc = self.depends(self.FuncDef)
    with self.for_range(dimensions[0]) as (loop, idx):
        args = []
        for i, (arg, arg_type) in enumerate(zip(pyarys,
                                                innerfunc.handle.args)):
            if C.pointer(PyArray.llvm_type()) != arg_type.type:
                # scalar: load the element through the data pointer
                val = arg.data[0:].cast(C.pointer(arg_type.type)).load()
                args.append(val)
            else:
                # array: pass a pointer to the fake PyArray struct
                casted = arg.reference().cast(arg_type.type)
                args.append(casted)
        innerfunc(*args)
        # Advance each array's data pointer by its outer-loop byte step.
        for i, ary in enumerate(pyarys):
            ary.data.assign(ary.data[steps[i]:])
def gen_vector2d_dist(mod):
    """Emit ``float vector2d_dist(Vector2D *v)`` returning x*x + y*y.

    Note: this is the squared magnitude — no square root is taken.
    """
    fnty = Type.function(C.float, [C.pointer(Vector2D.llvm_type())])
    fn = mod.add_function(fnty, 'vector2d_dist')
    builder = CBuilder(fn)
    v = builder.var(Vector2D, builder.args[0].load())
    builder.ret(v.x * v.x + v.y * v.y)
    builder.close()
    return fn
def gen_vector2d_dist(mod):
    """Emit ``float vector2d_dist(Vector2D *v)`` returning x*x + y*y.

    Note: this is the squared magnitude — no square root is taken.
    """
    signature = Type.function(C.float, [C.pointer(Vector2D.llvm_type())])
    handle = mod.add_function(signature, "vector2d_dist")
    builder = CBuilder(handle)
    point = builder.var(Vector2D, builder.args[0].load())
    squared = point.x * point.x + point.y * point.y
    builder.ret(squared)
    builder.close()
    return handle
class BasicUFunc(CDefinition):
    '''a generic ufunc that wraps the workload
    '''

    # Standard NumPy ufunc inner-loop signature:
    # (char **args, npy_intp *dimensions, npy_intp *steps, void *data)
    _argtys_ = [
        ('args', C.pointer(C.char_p), [ATTR_NO_ALIAS]),
        ('dimensions', C.pointer(C.intp), [ATTR_NO_ALIAS]),
        ('steps', C.pointer(C.intp), [ATTR_NO_ALIAS]),
        ('data', C.void_p, [ATTR_NO_ALIAS]),
    ]

    def body(self, args, dimensions, steps, data):
        """Emit the scalar ufunc outer loop around the wrapped kernel."""
        ufunc_ptr = self.depends(self.FuncDef)
        fnty = ufunc_ptr.type.pointee

        arg_ptrs = []
        arg_steps = []
        # One pointer/step per kernel argument, plus one for the output.
        for i in range(len(fnty.args) + 1):
            arg_ptrs.append(self.var_copy(args[i]))
            const_steps = self.var_copy(steps[i])
            const_steps.invariant = True  # steps are loop-invariant
            arg_steps.append(const_steps)

        with self.for_range(dimensions[0]) as (loop, item):
            callargs = []
            for i, argty in enumerate(fnty.args):
                casted = arg_ptrs[i].cast(C.pointer(argty))
                callargs.append(casted.load())
                arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])  # increment pointer
            # Pass inline=True directly instead of **dict(inline=True).
            res = ufunc_ptr(*callargs, inline=True)
            retval_ptr = arg_ptrs[-1].cast(C.pointer(fnty.return_type))
            retval_ptr.store(res, nontemporal=True)
            arg_ptrs[-1].assign(arg_ptrs[-1][arg_steps[-1]:])

        self.ret()

    # NOTE(review): defined without @classmethod; CDefinition appears to
    # invoke specialize on the class object directly — confirm before
    # adding the decorator.
    def specialize(cls, func_def):
        '''specialize to a workload
        '''
        cls._name_ = 'basicufunc_%s' % (func_def)
        cls.FuncDef = func_def
class PyObjectHead(CStruct):
    # Mirrors the layout of CPython's PyObject header so generated code
    # can poke at refcount/type fields directly.
    _fields_ = [
        ('ob_refcnt', C.intp),
        # NOTE: not an integer, just need to match definition in numba
        ('ob_type', C.pointer(C.int)),
    ]

    if llvm_types._trace_refs_:
        # Account for _PyObject_HEAD_EXTRA
        _fields_ = [
            ('ob_next', _intp_ptr),
            ('ob_prev', _intp_ptr),
        ] + _fields_
class PThreadAPI(CExternal):
    '''external declaration of pthread API
    '''
    # pthread_t is treated as an opaque pointer-sized handle.
    pthread_t = C.void_p

    # int pthread_create(pthread_t *thread, const pthread_attr_t *attr,
    #                    void *(*start_routine)(void *), void *arg)
    pthread_create = Type.function(C.int,
                                   [C.pointer(pthread_t),  # thread_t
                                    C.void_p,              # thread attr
                                    C.void_p,              # function
                                    C.void_p])             # arg

    # int pthread_join(pthread_t thread, void **retval)
    pthread_join = Type.function(C.int, [C.void_p, C.void_p])
def body(self, args, dimensions, steps, data):
    """Emit the ufunc outer loop for a kernel with nin inputs / nout outputs.

    Inputs are loaded by value; outputs are passed as raw element
    pointers. When the kernel returns a value it is additionally stored
    (non-temporally) through the first output pointer.
    """
    func = self.depends(self.FuncDef)

    arg_ptrs = []
    arg_steps = []
    for i in range(self.nin + self.nout):
        arg_ptrs.append(self.var_copy(args[i]))
        const_steps = self.var_copy(steps[i])
        const_steps.invariant = True  # steps are loop-invariant
        arg_steps.append(const_steps)

    # Element count is also loop-invariant.
    N = self.var_copy(dimensions[0])
    N.invariant = True

    with self.for_range(N) as (loop, item):
        callargs = []
        # Inputs: load the element value.
        for i, arg in enumerate(self.in_args):
            casted = arg_ptrs[i].cast(C.pointer(arg.type))
            callargs.append(casted.load())
        # Outputs: pass the element pointer itself (offset past inputs).
        for i, arg in enumerate(self.out_args):
            i += self.nin
            casted = arg_ptrs[i].cast(arg.type)
            callargs.append(casted)

        if self.returns_value:
            res = func(*callargs, inline=True)
            retval_ptr = arg_ptrs[self.nin].cast(C.pointer(self.return_type))
            retval_ptr.store(res, nontemporal=True)
        else:
            func(*callargs, inline=True)

        for i in range(self.nin + self.nout):
            # increment pointers
            arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])

    self.ret()
def gen_test_worker(mod):
    """Emit ``void worker(int *p)``: atomically add 1 to *p, REPEAT times.

    Uses an acq_rel atomic add inside a counted loop.
    """
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    target = builder.args[0]
    step = builder.constant(target.type.pointee, 1)
    counter = builder.var(C.int, 0)
    bound = builder.constant(C.int, REPEAT)
    with builder.loop() as loop:
        with loop.condition() as setcond:
            setcond(counter < bound)
        with loop.body():
            builder.atomic_add(target, step, 'acq_rel')
            counter += step
    builder.ret()
    builder.close()
    return builder.function
def gen_test_worker(mod):
    """Emit ``void worker(int *p)``: atomically add 1 to *p, REPEAT times.

    Uses an acq_rel atomic add inside a counted loop.
    """
    cb = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    shared_slot = cb.args[0]
    unit = cb.constant(shared_slot.type.pointee, 1)
    max_iters = cb.constant(C.int, REPEAT)
    idx = cb.var(C.int, 0)
    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(idx < max_iters)
        with loop.body():
            cb.atomic_add(shared_slot, unit, 'acq_rel')
            idx += unit
    cb.ret()
    cb.close()
    return cb.function
def _dispatch_worker(self, worker, contexts, num_thread):
    """Launch num_thread Windows threads running ``worker`` and join them.

    Each thread receives contexts[i] (cast to void*) as its parameter;
    all threads are waited on with an infinite timeout, then closed.
    """
    api = WinThreadAPI(self)
    NULL = self.constant_null(C.void_p)
    lpdword_NULL = self.constant_null(C.pointer(C.int32))
    zero = self.constant(C.int32, 0)
    intp_zero = self.constant(C.intp, 0)
    # 0xFFFFFFFF == INFINITE: wait until the thread exits.
    INFINITE = self.constant(C.int32, 0xFFFFFFFF)

    threads = self.array(api.handle_t, num_thread, name='threads')

    # self.debug("launch threads")
    # TODO error handling
    with self.for_range(num_thread) as (loop, i):
        threads[i] = api.CreateThread(NULL, intp_zero, worker,
                                      contexts[i].reference().cast(C.void_p),
                                      zero, lpdword_NULL)

    with self.for_range(num_thread) as (loop, i):
        api.WaitForSingleObject(threads[i], INFINITE)
        api.CloseHandle(threads[i])
def _dispatch_worker(self, worker, contexts, num_thread):
    """Launch num_thread Windows threads running ``worker`` and join them.

    Each thread receives contexts[i] (cast to void*) as its parameter;
    all threads are waited on with an infinite timeout, then closed.
    """
    api = WinThreadAPI(self)
    NULL = self.constant_null(C.void_p)
    lpdword_NULL = self.constant_null(C.pointer(C.int32))
    zero = self.constant(C.int32, 0)
    intp_zero = self.constant(C.intp, 0)
    # 0xFFFFFFFF == INFINITE: wait until the thread exits.
    INFINITE = self.constant(C.int32, 0xFFFFFFFF)

    threads = self.array(api.handle_t, num_thread, name='threads')

    # self.debug("launch threads")
    # TODO error handling
    with self.for_range(num_thread) as (loop, i):
        threads[i] = api.CreateThread(
            NULL, intp_zero, worker,
            contexts[i].reference().cast(C.void_p),
            zero, lpdword_NULL)

    with self.for_range(num_thread) as (loop, i):
        api.WaitForSingleObject(threads[i], INFINITE)
        api.CloseHandle(threads[i])
def _fold_loop(self, f, init, cb):
    """Emit a left fold of ``f`` over the ints read from the object store.

    Reads ``prod(self.dims)`` C ints starting at offset 0 via cls_read,
    then accumulates ``init = f(init, buf[i])`` for every int actually
    returned by the read.
    """
    hctx = cb.args[0]
    total_elems = reduce(lambda acc, d: acc * d, self.dims, 1)
    n_elems = cb.constant(C.int, total_elems)
    elem_bytes = cb.sizeof(C.int).cast(C.int)
    nbytes = n_elems * elem_bytes

    buf = cb.var(C.pointer(C.int))
    buf_p = buf.reference().cast(C.char_pp)
    bytes_read = cb.var(C.int)
    offset = cb.var(C.int).assign(cb.zero)
    cb.cls_read(hctx, offset, nbytes, buf_p, bytes_read.reference())

    # Fold over only the ints the read actually produced.
    idx = cb.var(C.int).assign(cb.zero)
    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(idx < (bytes_read / elem_bytes))
        with loop.body():
            init.assign(f(init, buf[idx]))
            idx += cb.one
def _fold_loop(self, f, init, cb):
    """Emit a left fold of ``f`` over the ints read from the object store.

    Reads ``prod(self.dims)`` C ints starting at offset 0 via cls_read,
    then accumulates ``init = f(init, buf[i])`` for every int actually
    returned by the read.
    """
    handle = cb.args[0]
    elem_count = reduce(lambda a, b: a * b, self.dims, 1)
    count_const = cb.constant(C.int, elem_count)
    int_width = cb.sizeof(C.int).cast(C.int)
    request_bytes = count_const * int_width

    data = cb.var(C.pointer(C.int))
    data_pp = data.reference().cast(C.char_pp)
    returned = cb.var(C.int)
    start = cb.var(C.int).assign(cb.zero)
    cb.cls_read(handle, start, request_bytes, data_pp, returned.reference())

    # Iterate only over the ints the read actually produced.
    pos = cb.var(C.int).assign(cb.zero)
    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(pos < (returned / int_width))
        with loop.body():
            init.assign(f(init, data[pos]))
            pos += cb.one
class LibOSD(CExternal):
    # External declarations for an object-storage-daemon "cls" API
    # (presumably the Ceph RADOS object-class interface — TODO confirm).
    # All calls take an opaque handler context (void*) first.
    cls_read = Type.function(C.int,
                             [C.void_p, C.int, C.int, C.char_pp,
                              C.pointer(C.int)])
    cls_write = Type.function(C.int, [C.void_p, C.int, C.int, C.char_p])
    cls_write_bl = Type.function(C.int, [C.void_p, C.int, C.int, C.void_p])
    cls_write_bl_full = Type.function(C.int, [C.void_p, C.int, C.void_p])
    cls_write_full = Type.function(C.int, [C.void_p, C.char_p])
    cls_map_get_val = Type.function(C.int,
                                    [C.void_p, C.char_p, C.char_pp,
                                     C.pointer(C.int)])
    cls_setxattr = Type.function(C.int,
                                 [C.void_p, C.char_p, C.char_p, C.int])
    cls_map_get_keys = Type.function(C.int,
                                     [C.void_p, C.char_p, C.int64,
                                      C.pointer(C.char_pp),
                                      C.pointer(C.pointer(C.int))])
    cls_stat = Type.function(C.int,
                             [C.void_p, C.pointer(C.int),
                              C.pointer(C.int64)])
    # Trailing True marks this as a varargs function (printf-style log).
    cls_log = Type.function(C.int, [C.int, C.char_p], True)
def gen_test_worker(mod):
    """Emit ``void worker(int *p)``: increment *p REPEAT times via CAS.

    Each successful compare-and-swap (acquire load, release cmpxchg)
    counts as one completed iteration; failed CAS attempts retry.
    """
    builder = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    target = builder.args[0]
    step = builder.constant(target.type.pointee, 1)
    done = builder.var(C.int, 0)
    total = builder.constant(C.int, REPEAT)
    with builder.loop() as loop:
        with loop.condition() as setcond:
            setcond(done < total)
        with loop.body():
            expected = target.atomic_load('acquire')
            desired = expected + step
            observed = target.atomic_cmpxchg(expected, desired, 'release')
            # Only count iterations where the CAS actually succeeded.
            with builder.ifelse(observed == expected) as ifelse:
                with ifelse.then():
                    done += step
    builder.ret()
    builder.close()
    return builder.function
def gen_test_worker(mod):
    """Emit ``void worker(int *p)``: increment *p REPEAT times via CAS.

    Each successful compare-and-swap (acquire load, release cmpxchg)
    counts as one completed iteration; failed CAS attempts retry.
    """
    cb = CBuilder.new_function(mod, 'worker', C.void, [C.pointer(C.int)])
    slot = cb.args[0]
    unit = cb.constant(slot.type.pointee, 1)
    successes = cb.var(C.int, 0)
    goal = cb.constant(C.int, REPEAT)
    with cb.loop() as loop:
        with loop.condition() as setcond:
            setcond(successes < goal)
        with loop.body():
            seen = slot.atomic_load('acquire')
            bumped = seen + unit
            prior = slot.atomic_cmpxchg(seen, bumped, 'release')
            # A matching prior value means our CAS won; count it.
            with cb.ifelse(prior == seen) as ifelse:
                with ifelse.then():
                    successes += unit
    cb.ret()
    cb.close()
    return cb.function
# -*- coding: utf-8 -*- from __future__ import print_function, division, absolute_import from numba import * from numba import llvm_types from numba import typedefs from numba.utility.cbuilder.library import register from numba.utility.cbuilder.numbacdef import NumbaCDefinition, from_numba from llvm_cbuilder import shortnames #------------------------------------------------------------------------ # Utilities #------------------------------------------------------------------------ p_py_ssize_t = shortnames.pointer(shortnames.py_ssize_t) def ob_refcnt(obj_p): return deref(p_refcnt(obj_p)) def p_refcnt(obj_p): return obj_p.cast(p_py_ssize_t) def deref(obj_p): return obj_p[0] def const(ctemp, val): return ctemp.parent.constant(shortnames.py_ssize_t, val)
# -*- coding: utf-8 -*- from __future__ import print_function, division, absolute_import from numba import * from numba import llvm_types from numba import typedefs from numba.utility.cbuilder.library import register from numba.utility.cbuilder.numbacdef import NumbaCDefinition, from_numba from llvm_cbuilder import shortnames #------------------------------------------------------------------------ # Utilities #------------------------------------------------------------------------ p_py_ssize_t = shortnames.pointer(shortnames.py_ssize_t) def ob_refcnt(obj_p): return deref(p_refcnt(obj_p)) def p_refcnt(obj_p): return obj_p.cast(p_py_ssize_t) def deref(obj_p): return obj_p[0] def const(ctemp, val): return ctemp.parent.constant(shortnames.py_ssize_t, val) def add_refcnt(obj_p, refcnt): refcnt = const(obj_p, refcnt) refs = ob_refcnt(obj_p)
class Broadcast(NumbaCDefinition):
    """
    Transliteration of

        @cname('__pyx_memoryview_broadcast')
        cdef bint __pyx_broadcast(Py_ssize_t *dst_shape,
                                  Py_ssize_t *input_shape,
                                  Py_ssize_t *strides,
                                  int max_ndim, int ndim,
                                  bint *p_broadcast) nogil except -1:

            cdef Py_ssize_t i
            cdef int dim_offset = max_ndim - ndim

            for i in range(ndim):
                src_extent = input_shape[i]
                dst_extent = dst_shape[i + dim_offset]

                if src_extent == 1:
                    p_broadcast[0] = True
                    strides[i] = 0
                elif dst_extent == 1:
                    dst_shape[i + dim_offset] = src_extent
                elif src_extent != dst_extent:
                    __pyx_err_extents(i, dst_shape[i], input_shape[i])
    """

    _name_ = "__numba_util_broadcast"
    _argtys_ = [
        ('dst_shape', C.pointer(C.npy_intp)),
        ('src_shape', C.pointer(C.npy_intp)),
        ('src_strides', C.pointer(C.npy_intp)),
        ('max_ndim', C.int),
        ('ndim', C.int),
    ]
    # Returns 1 on success, 0 on an extent mismatch (NOTE(review): unlike
    # the Cython original, no error is raised and no p_broadcast flag is
    # set — callers must check the return value).
    _retty_ = C.int

    def body(self, dst_shape, src_shape, src_strides, max_ndim, ndim):
        # Source dims are right-aligned against the destination dims.
        dim_offset = max_ndim - ndim

        def constants(type):
            return self.constant(type, 0), self.constant(type, 1)

        zero, one = constants(C.npy_intp)
        zero_int, one_int = constants(C.int)

        with self.for_range(ndim) as (loop, i):
            src_extent = src_shape[i]
            dst_extent = dst_shape[i + dim_offset]

            with self.ifelse(src_extent == one) as ifelse:
                with ifelse.then():
                    # Broadcast dimension: stride 0 repeats the element.
                    src_strides[i] = zero
                with ifelse.otherwise():
                    with self.ifelse(dst_extent == one) as ifelse:
                        with ifelse.then():
                            # Destination adopts the source extent.
                            dst_shape[i + dim_offset] = src_extent
                        with ifelse.otherwise():
                            with self.ifelse(
                                    src_extent != dst_extent) as ifelse:
                                with ifelse.then():
                                    # Shape mismatch
                                    self.ret(zero_int)

        self.ret(one_int)
def _make_array_type(self, ndim, cb):
    """Build the element type for an ndim-dimensional int array.

    1 dimension yields ``int``; each extra dimension wraps another
    pointer level (int*, int**, ...). ``cb`` is threaded through but
    unused at this level.
    """
    if ndim == 1:
        return C.int
    return C.pointer(self._make_array_type(ndim - 1, cb))
class GUFuncEntry(CDefinition):
    '''a generalized ufunc that wraps a numba jit'ed function

    NOTE: Currently, this only works for array return type.
    And, return type must be the last argument of the numba jit'ed function.
    '''
    # Standard NumPy gufunc inner-loop signature.
    _argtys_ = [
        ('args', C.pointer(C.char_p)),
        ('dimensions', C.pointer(C.intp)),
        ('steps', C.pointer(C.intp)),
        ('data', C.void_p),
    ]

    def _outer_loop(self, dargs, dimensions, pyarys, steps, data):
        """Emit the loop over dimensions[0], calling the inner function
        with scalars or PyArray references and advancing data pointers."""
        # implement outer loop
        innerfunc = self.depends(self.FuncDef)
        with self.for_range(dimensions[0]) as (loop, idx):
            args = []
            for i, (arg, arg_type) in enumerate(zip(pyarys,
                                                    innerfunc.handle.args)):
                if C.pointer(PyArray.llvm_type()) != arg_type.type:
                    # scalar: load the value through the data pointer
                    val = arg.data[0:].cast(C.pointer(arg_type.type)).load()
                    args.append(val)
                else:
                    # array: pass a pointer to the fake PyArray struct
                    casted = arg.reference().cast(arg_type.type)
                    args.append(casted)
            innerfunc(*args)
            # Advance each array's data pointer by its outer byte step.
            for i, ary in enumerate(pyarys):
                ary.data.assign(ary.data[steps[i]:])

    def body(self, args, dimensions, steps, data):
        """Parse the gufunc signature, fake up PyArray structs for each
        operand, then run the outer loop."""
        diminfo = list(_parse_signature(self.Signature))
        n_pyarys = len(diminfo)
        assert n_pyarys == len(self.dtypes)

        # extract unique dimension names, in first-seen order
        dims = []
        for grp in diminfo:
            for it in grp:
                if it not in dims:
                    if it:
                        dims.append(it)

        # build pyarrays for argument to inner function
        pyarys = [self.var(PyArray) for _ in range(n_pyarys)]

        # populate pyarrays: core-dimension extents come from
        # dimensions[1:], inner steps from steps[step_offset:].
        step_offset = len(pyarys)
        for i, (dtype, ary) in enumerate(zip(self.dtypes, pyarys)):
            ary_ndim = len([x for x in diminfo[i] if x])
            ary_dims = []
            for k in diminfo[i]:
                if k:
                    ary_dims.append(dimensions[1 + dims.index(k)])
                else:
                    # scalar operand: zero-extent placeholder
                    ary_dims.append(self.constant(C.intp, 0))
            ary_steps = []
            if not ary_ndim:
                ary_steps.append(self.constant(C.intp, 0))
            for j in range(ary_ndim):
                ary_steps.append(steps[step_offset])
                step_offset += 1
            ary.fakeit(dtype, args[i], ary_dims, ary_steps)

        self._outer_loop(args, dimensions, pyarys, steps, data)
        self.ret()

    @classmethod
    def specialize(cls, dtypes, signature, func_def):
        '''specialize to a workload
        '''
        signature = signature.replace(' ', '')  # remove all spaces
        cls.dtypes = dtypes
        cls._name_ = 'gufunc_%s_%s' % (signature, func_def)
        cls.FuncDef = func_def
        cls.Signature = signature
def _get_tys_list(self):
    """Collect, per translated function, the numpy dtype numbers of its
    argument types (array arguments contribute their element dtype)."""
    types_lists = []
    for numba_func in self.translates:
        dtype_nums = []
        types_lists.append(dtype_nums)
        for arg_type in self.get_argtypes(numba_func):
            if arg_type.is_array:
                # Arrays are represented by their element dtype.
                arg_type = arg_type.dtype
            dtype_nums.append(arg_type.get_dtype())
    return types_lists

# Backwards-compatible alias for the AST-based vectorizer.
GUFuncVectorize = GUFuncASTVectorize

_intp_ptr = C.pointer(C.intp)

class PyObjectHead(CStruct):
    # Mirrors the layout of CPython's PyObject header.
    _fields_ = [
        ('ob_refcnt', C.intp),
        # NOTE: not an integer, just need to match definition in numba
        ('ob_type', C.void_p),
    ]

    if llvm_types._trace_refs_:
        # Account for _PyObject_HEAD_EXTRA
        _fields_ = [
            ('ob_next', _intp_ptr),
            ('ob_prev', _intp_ptr),
        ] + _fields_
def _make_array_type(self, ndim, cb):
    """Build the element type for an ndim-dimensional int array.

    1 dimension yields ``int``; each extra dimension wraps another
    pointer level (int*, int**, ...). ``cb`` is accepted for interface
    compatibility but not used.
    """
    ty = C.int
    for _ in range(ndim - 1):
        ty = C.pointer(ty)
    return ty
class BasicUFunc(CDefinition):
    '''a generic ufunc that wraps the workload
    '''

    # Standard NumPy ufunc inner-loop signature:
    # (char **args, npy_intp *dimensions, npy_intp *steps, void *data)
    _argtys_ = [
        ('args', C.pointer(C.char_p), [ATTR_NO_ALIAS]),
        ('dimensions', C.pointer(C.intp), [ATTR_NO_ALIAS]),
        ('steps', C.pointer(C.intp), [ATTR_NO_ALIAS]),
        ('data', C.void_p, [ATTR_NO_ALIAS]),
    ]

    def body(self, args, dimensions, steps, data):
        """Emit the ufunc outer loop, supporting return-by-reference
        kernels (void return type writes through the last argument)."""
        ufunc_ptr = self.depends(self.FuncDef)
        fnty = ufunc_ptr.type.pointee

        # void return type implies return by reference in last argument
        is_void_ret = fnty.return_type.kind == lc.TYPE_VOID
        argtys = fnty.args[:-1] if is_void_ret else fnty.args

        arg_ptrs = []
        arg_steps = []
        # One pointer/step per real argument, plus one for the output.
        for i in range(len(argtys) + 1):
            arg_ptrs.append(self.var_copy(args[i]))
            const_steps = self.var_copy(steps[i])
            const_steps.invariant = True  # steps are loop-invariant
            arg_steps.append(const_steps)

        with self.for_range(dimensions[0]) as (loop, item):
            callargs = []
            for i, argty in enumerate(argtys):
                if argty.kind == lc.TYPE_POINTER:
                    # Pointer argument: pass the element address.
                    casted = arg_ptrs[i].cast(argty)
                    callargs.append(casted)
                else:
                    casted = arg_ptrs[i].cast(C.pointer(argty))
                    callargs.append(casted.load())
                # increment pointer
                arg_ptrs[i].assign(arg_ptrs[i][arg_steps[i]:])

            if is_void_ret:
                retty = fnty.args[-1]
                assert retty.kind == lc.TYPE_POINTER
                # Stack slot the kernel writes its result into.
                retval = self.var(retty.pointee)
                callargs.append(retval.ref)
                # (removed leftover debug print of callargs)
                ufunc_ptr(*callargs, inline=True)
                res = retval
                retval_ptr = arg_ptrs[-1].cast(retty)
            else:
                res = ufunc_ptr(*callargs, inline=True)
                retval_ptr = arg_ptrs[-1].cast(C.pointer(fnty.return_type))

            retval_ptr.store(res, nontemporal=True)
            arg_ptrs[-1].assign(arg_ptrs[-1][arg_steps[-1]:])

        self.ret()

    # NOTE(review): defined without @classmethod; CDefinition appears to
    # invoke specialize on the class object directly — confirm before
    # adding the decorator.
    def specialize(cls, func_def):
        '''specialize to a workload
        '''
        cls._name_ = 'basicufunc_%s' % (func_def)
        cls.FuncDef = func_def
class SliceArray(CDefinition):
    # Emits code that applies start:stop:step along src_dim of the input,
    # writing the resulting extent/stride into dst_dim of the output, and
    # returns the data pointer advanced to the slice start.
    _name_ = "slice"
    _retty_ = C.char_p
    _argtys_ = [
        ('data', C.char_p),
        ('in_shape', C.pointer(C.npy_intp)),
        ('in_strides', C.pointer(C.npy_intp)),
        ('out_shape', C.pointer(C.npy_intp)),
        ('out_strides', C.pointer(C.npy_intp)),
        ('start', C.npy_intp),
        ('stop', C.npy_intp),
        ('step', C.npy_intp),
        ('src_dim', C.int),
        ('dst_dim', C.int),
    ]

    def _adjust_given_index(self, extent, negative_step, index, is_start):
        # Tranliterate the below code to llvm cbuilder
        # TODO: write in numba

        # For the start index in start:stop:step, do:
        #     if have_start:
        #         if start < 0:
        #             start += shape
        #             if start < 0:
        #                 start = 0
        #         elif start >= shape:
        #             if negative_step:
        #                 start = shape - 1
        #             else:
        #                 start = shape
        #     else:
        #         if negative_step:
        #             start = shape - 1
        #         else:
        #             start = 0

        # For the stop index, do:
        #     if stop is not None:
        #         if stop < 0:
        #             stop += extent
        #             if stop < 0:
        #                 stop = 0
        #         elif stop > extent:
        #             stop = extent
        #     else:
        #         if negative_step:
        #             stop = -1
        #         else:
        #             stop = extent
        one, zero = get_constants(self)

        with self.ifelse(index < zero) as ifelse:
            with ifelse.then():
                # Negative index: wrap around, then clamp at zero.
                index += extent
                with self.ifelse(index < zero) as ifelse_inner:
                    with ifelse_inner.then():
                        index.assign(zero)

            with ifelse.otherwise():
                with self.ifelse(index >= extent) as ifelse:
                    with ifelse.then():
                        if is_start:
                            # index is 'start' index
                            with self.ifelse(negative_step) as ifelse:
                                with ifelse.then():
                                    index.assign(extent - one)
                                with ifelse.otherwise():
                                    index.assign(extent)
                        else:
                            # index is 'stop' index. Stop is exclusive, so
                            # we don't care about the sign of the step
                            index.assign(extent)

    def _set_default_index(self, default1, default2, negative_step, index):
        # No index was given: pick default1 for a negative step,
        # default2 otherwise.
        with self.ifelse(negative_step) as ifelse:
            with ifelse.then():
                index.assign(default1)
            with ifelse.otherwise():
                index.assign(default2)

    def adjust_index(self, extent, negative_step, index, default1, default2,
                     is_start=False, have_index=True):
        # Normalize a given index, or install the appropriate default.
        if have_index:
            self._adjust_given_index(extent, negative_step, index, is_start)
        else:
            self._set_default_index(default1, default2, negative_step, index)

    def body(self, data, in_shape, in_strides, out_shape, out_strides,
             start, stop, step, src_dim, dst_dim):
        stride = in_strides[src_dim]
        extent = in_shape[src_dim]

        one, zero = get_constants(self)

        if not self.have_step:
            step = one

        negative_step = step < zero

        self.adjust_index(extent, negative_step, start,
                          default1=extent - one, default2=zero,
                          is_start=True, have_index=self.have_start)
        self.adjust_index(extent, negative_step, stop,
                          default1=-one, default2=extent,
                          have_index=self.have_stop)

        # self.debug("extent", extent)
        # self.debug("negative_step", negative_step.cast(C.npy_intp))
        # self.debug("start/stop/step", start, stop, step)

        # ceil-divide (stop - start) / step, clamped at zero.
        new_extent = self.var(C.npy_intp)
        new_extent.assign((stop - start) / step)
        with self.ifelse((stop - start) % step != zero) as ifelse:
            with ifelse.then():
                new_extent += one

        with self.ifelse(new_extent < zero) as ifelse:
            with ifelse.then():
                new_extent.assign(zero)

        # Advance the data pointer to the first selected element.
        result = self.var(data.type, name='result')
        result.assign(data[start * stride:])
        out_shape[dst_dim] = new_extent
        # self.debug("new_extent", new_extent)
        # self.debug("out stride:", dst_dim, stride * step)
        out_strides[dst_dim] = stride * step
        self.ret(result)

    def specialize(self, context, have_start, have_stop, have_step):
        # Specialize on which of start/stop/step were actually supplied;
        # each combination gets its own named definition.
        self.context = context
        self.have_start = have_start
        self.have_stop = have_stop
        self.have_step = have_step
        self._name_ = "slice_%s_%s_%s" % (have_start, have_stop, have_step)
def _get_tys_list(self):
    """Collect, per translated function, the numpy dtype numbers of its
    argument types (array arguments contribute their element dtype)."""
    types_lists = []
    for numba_func in self.translates:
        dtype_nums = []
        types_lists.append(dtype_nums)
        for arg_type in self.get_argtypes(numba_func):
            if arg_type.is_array:
                # Arrays are represented by their element dtype.
                arg_type = arg_type.dtype
            dtype_nums.append(arg_type.get_dtype())
    return types_lists

# Backwards-compatible alias for the AST-based vectorizer.
GUFuncVectorize = GUFuncASTVectorize

_intp_ptr = C.pointer(C.intp)

class PyObjectHead(CStruct):
    # Mirrors the layout of CPython's PyObject header.
    _fields_ = [
        ('ob_refcnt', C.intp),
        # NOTE: not an integer, just need to match definition in numba
        ('ob_type', C.pointer(C.int)),
    ]

    if llvm_types._trace_refs_:
        # Account for _PyObject_HEAD_EXTRA
        _fields_ = [
            ('ob_next', _intp_ptr),
            ('ob_prev', _intp_ptr),
        ] + _fields_