def build(self, cres, signature):
    """Wrap the compiled function as a ufunc loop entry point.

    Returns a ``(dtype numbers, function pointer, environment)`` tuple.
    The environment is ``None`` unless ``cres`` was compiled in object mode.
    """
    target_ctx = cres.target_context
    codelib = cres.library
    inner_fn = codelib.get_function(cres.fndesc.llvm_func_name)

    env = None
    envptr = None
    if cres.objectmode:
        env = cres.environment
        assert env is not None
        intp_ty = target_ctx.get_value_type(types.intp)
        pyobj_ty = target_ctx.get_value_type(types.pyobject)
        # The address of the environment object is baked in as a constant.
        envptr = lc.Constant.int(intp_ty, id(env)).inttoptr(pyobj_ty)

    wrapper = build_ufunc_wrapper(codelib, target_ctx, inner_fn, signature,
                                  cres.objectmode, envptr)
    ptr = codelib.get_pointer_to_function(wrapper.name)

    # dtype numbers: one per argument, followed by the return type
    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, env
def _build_element_wise_ufunc_wrapper(cres, signature):
    '''Create the ufunc loop wrapper for a compilation result.

    The wrapper is built for the element-wise ``signature``.  Returns the
    NumPy dtype numbers (arguments first, return type last), the value
    returned by ``build_ufunc_wrapper`` and the environment (``None`` when
    not in object mode).
    '''
    target_ctx = cres.target_context
    codelib = cres.library
    symbol = cres.fndesc.llvm_func_name

    env = None
    envptr = None
    if cres.objectmode:
        env = cres.environment
        assert env is not None
        intp_ty = target_ctx.get_value_type(types.intp)
        pyobj_ty = target_ctx.get_value_type(types.pyobject)
        # The address of the environment object is baked in as a constant.
        envptr = lc.Constant.int(intp_ty, id(env)).inttoptr(pyobj_ty)

    ptr = build_ufunc_wrapper(codelib, target_ctx, symbol, signature,
                              cres.objectmode, envptr, env)

    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, env
def _build_element_wise_ufunc_wrapper(cres, signature):
    '''Create the ufunc loop wrapper for a compilation result.

    The wrapper is built around the LLVM function named by
    ``cres.fndesc`` using the element-wise ``signature``.  Returns the NumPy
    dtype numbers (arguments first, return type last), the wrapper's
    function pointer and the environment (``None`` when not in object mode).
    '''
    target_ctx = cres.target_context
    codelib = cres.library
    inner_fn = codelib.get_function(cres.fndesc.llvm_func_name)

    env = None
    envptr = None
    if cres.objectmode:
        env = cres.environment
        assert env is not None
        intp_ty = target_ctx.get_value_type(types.intp)
        pyobj_ty = target_ctx.get_value_type(types.pyobject)
        # The address of the environment object is baked in as a constant.
        envptr = lc.Constant.int(intp_ty, id(env)).inttoptr(pyobj_ty)

    wrapper = build_ufunc_wrapper(codelib, target_ctx, inner_fn, signature,
                                  cres.objectmode, envptr, env)
    ptr = codelib.get_pointer_to_function(wrapper.name)

    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, env
def find_common_dtype_from_numpy_dtypes(array_types, scalar_types):
    """Return the common Numba dtype for sequences of Numba dtypes.

    Every element of ``array_types`` and ``scalar_types`` is a Numba dtype
    that represents some NumPy dtype.  Both sequences are converted to NumPy
    dtypes, combined with ``numpy.find_common_type`` and the result is
    converted back to a Numba dtype.
    """
    # NOTE(review): numpy.find_common_type is deprecated in recent NumPy
    # releases - confirm the NumPy versions this module must support.
    array_np = list(map(numpy_support.as_dtype, array_types))
    scalar_np = list(map(numpy_support.as_dtype, scalar_types))
    common_np = numpy.find_common_type(array_np, scalar_np)
    return numpy_support.from_dtype(common_np)
def kernel_wrapper(values):
    """Run ``kernel`` over ``values`` and return the outputs as a list.

    ``values`` is a sequence of argument tuples; element ``i`` of each tuple
    is copied into a typed input array before the kernel is launched with
    256 threads per block.
    """
    n = len(values)
    out_arr = np.empty(n, dtype=numpy_support.as_dtype(restype))

    in_arrs = []
    for tp in argtypes:
        in_arrs.append(np.empty(n, dtype=numpy_support.as_dtype(tp)))
    # Transpose the rows of ``values`` into one column array per argument.
    for row, arg_tuple in enumerate(values):
        for item, column in zip(arg_tuple, in_arrs):
            column[row] = item

    n_blocks = int(math.ceil(n / 256))
    kernel[n_blocks, 256](out_arr, *in_arrs)
    return list(out_arr)
def check_round(cfunc, values, inty, outty, decimals):
    """Compare ``cfunc`` against ``_fixed_np_round`` for one type pair.

    Also checks that passing an output array of the wrong shape raises
    ``ValueError("invalid output shape")``.
    """
    # Input and output arrays of the requested types
    src = values.astype(as_dtype(inty))
    got = np.zeros_like(src).astype(as_dtype(outty))
    expected = got.copy()

    _fixed_np_round(src, decimals, expected)
    cfunc(src, decimals, got)
    np.testing.assert_allclose(got, expected)

    # An output that is one element short must be rejected
    truncated = got[1:]
    with self.assertRaises(ValueError) as raises:
        cfunc(src, decimals, truncated)
    self.assertEqual(str(raises.exception), "invalid output shape")
def kernel_wrapper(values):
    """Launch ``kernel`` on the argument tuples in ``values``.

    Each tuple supplies one value per kernel argument.  The results are
    returned as a Python list, one entry per tuple.
    """
    count = len(values)

    def _alloc(nbtype):
        # Uninitialised array of ``count`` items with the matching dtype
        return np.empty(count, dtype=numpy_support.as_dtype(nbtype))

    columns = [_alloc(tp) for tp in argtypes]
    result = _alloc(restype)

    for pos, arg_tuple in enumerate(values):
        for value, column in zip(arg_tuple, columns):
            column[pos] = value

    launch_args = [result] + columns
    grid = int(math.ceil(count / 256))
    kernel[grid, 256](*launch_args)
    return list(result)
def _build_element_wise_ufunc_wrapper(cres, signature):
    '''Create the ufunc loop wrapper for a compilation result.

    The wrapper is built for the element-wise ``signature`` while holding
    ``global_compiler_lock``.  Returns the NumPy dtype numbers (arguments
    first, return type last), the value returned by ``build_ufunc_wrapper``
    and ``cres.environment``.
    '''
    target_ctx = cres.target_context
    codelib = cres.library
    symbol = cres.fndesc.llvm_func_name

    with global_compiler_lock:
        ptr = build_ufunc_wrapper(codelib, target_ctx, symbol, signature,
                                  cres.objectmode, cres)

    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, cres.environment
def build(self, cres):
    """
    Build the gufunc wrapper for ``cres``.

    Returns (dtype numbers, function ptr, EnvironmentObject)
    """
    signature = cres.signature
    info = build_gufunc_wrapper(
        self.py_func,
        cres,
        self.sin,
        self.sout,
        cache=self.cache,
        is_parfors=False,
    )
    env = info.env
    ptr = info.library.get_pointer_to_function(info.name)

    # Array arguments contribute their element dtype; others are used as-is
    dtypenums = [
        as_dtype(arg.dtype if isinstance(arg, types.Array) else arg).num
        for arg in signature.args
    ]
    return dtypenums, ptr, env
def _build_element_wise_ufunc_wrapper(cres, signature):
    '''Create the ufunc loop wrapper for a compilation result.

    The wrapper is built for the element-wise ``signature`` while holding
    ``global_compiler_lock``.  Returns the NumPy dtype numbers (arguments
    first, return type last), the wrapper's function pointer and
    ``cres.environment``.
    '''
    target_ctx = cres.target_context
    codelib = cres.library
    symbol = cres.fndesc.llvm_func_name

    with global_compiler_lock:
        info = build_ufunc_wrapper(codelib, target_ctx, symbol, signature,
                                   cres.objectmode, cres)
        ptr = info.library.get_pointer_to_function(info.name)

    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, cres.environment
def build(self, cres):
    """
    Build the gufunc wrapper for ``cres`` after calling
    ``_launch_threads()`` and ``_init()``.

    Returns (dtype numbers, function ptr, EnvironmentObject)
    """
    _launch_threads()
    _init()

    target_ctx = cres.target_context
    codelib = cres.library
    signature = cres.signature
    ptr, env = build_gufunc_wrapper(
        codelib,
        target_ctx,
        signature,
        self.sin,
        self.sout,
        fndesc=cres.fndesc,
        env=cres.environment,
    )

    def _scalar_type(arg):
        # Array arguments are described by their element dtype
        return arg.dtype if isinstance(arg, types.Array) else arg

    dtypenums = [as_dtype(_scalar_type(arg)).num for arg in signature.args]
    return dtypenums, ptr, env
def build(self, cres):
    """
    Build the gufunc wrapper around the LLVM function of ``cres``.

    Returns (dtype numbers, function ptr, EnvironmentObject)
    """
    target_ctx = cres.target_context
    codelib = cres.library
    signature = cres.signature
    inner_fn = codelib.get_function(cres.fndesc.llvm_func_name)

    wrapper, env = build_gufunc_wrapper(
        codelib,
        target_ctx,
        inner_fn,
        signature,
        self.sin,
        self.sout,
        fndesc=cres.fndesc,
        env=cres.environment,
    )
    ptr = codelib.get_pointer_to_function(wrapper.name)

    # Array arguments contribute their element dtype; others are used as-is
    dtypenums = [
        as_dtype(arg.dtype if isinstance(arg, types.Array) else arg).num
        for arg in signature.args
    ]
    return dtypenums, ptr, env
def test_hypot(self, flags=enable_pyobj_flags):
    """Check ``hypot`` on mixed inputs and on values that overflow a
    naive ``sqrt(x*x + y*y)`` implementation."""
    pyfunc = hypot
    x_types = [types.int64, types.uint64, types.float32, types.float64]
    x_values = [1, 2, 3, 4, 5, 6, .21, .34]
    y_values = [value + 2 for value in x_values]
    # Issue #563: precision issues with math.hypot() under Windows.
    prec = 'single'
    self.run_binary(pyfunc, x_types, x_values, y_values, flags, prec)

    def naive_hypot(x, y):
        # Reference implementation that overflows for large inputs
        return math.sqrt(x * x + y * y)

    overflow_pattern = 'overflow encountered in .*_scalars'
    for fltty in (types.float32, types.float64):
        compiled = self.ccache.compile(pyfunc, (fltty, fltty), flags=flags)
        cfunc = compiled.entry_point
        scalar_cls = numpy_support.as_dtype(fltty).type
        big = scalar_cls(np.finfo(scalar_cls).max / 30.)

        # The compiled implementation must stay finite and match Python's
        nb_ans = cfunc(big, big)
        self.assertPreciseEqual(nb_ans, pyfunc(big, big), prec='single')
        self.assertTrue(np.isfinite(nb_ans))

        # ...whereas the naive formula overflows
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            with self.assertRaisesRegexp(RuntimeWarning, overflow_pattern):
                naive_hypot(big, big)
def compile_udf(udf, type_signature):
    """Compile ``udf`` with `numba`

    Compile a python callable function ``udf`` with
    `numba.cuda.jit(device=True)` using ``type_signature`` into CUDA PTX
    together with the generated output type.

    The output is expected to be passed to the PTX parser in `libcudf`
    to generate a CUDA device function to be inlined into CUDA kernels,
    compiled at runtime and launched.

    Parameters
    --------
    udf:
      a python callable function

    type_signature:
      a tuple that specifies types of each of the input parameters of
      ``udf``. The types should be one in `numba.types` and could be
      converted from numpy types with
      `numba.numpy_support.from_dtype(...)`.

    Returns
    --------
    ptx_code:
      The compiled CUDA PTX

    output_type:
      A numpy type
    """
    device_fn = cuda.jit(udf, device=True)
    # Compile first so the specialisation exists before its PTX is inspected
    specialization = device_fn.compile(type_signature)
    ptx_bytes = device_fn.inspect_ptx(type_signature)
    return_dtype = numpy_support.as_dtype(specialization.signature.return_type)
    return (ptx_bytes.decode("utf-8"), return_dtype.type)
def test_hypot(self, flags=enable_pyobj_flags):
    """Check ``hypot`` on mixed inputs and on values that overflow a
    naive ``sqrt(x*x + y*y)`` implementation."""
    pyfunc = hypot
    x_types = [types.int64, types.uint64, types.float32, types.float64]
    x_values = [1, 2, 3, 4, 5, 6, .21, .34]
    y_values = [value + 2 for value in x_values]
    # Issue #563: precision issues with math.hypot() under Windows.
    if sys.platform == 'win32':
        prec = 'single'
    else:
        prec = 'exact'
    self.run_binary(pyfunc, x_types, x_values, y_values, flags, prec)

    def naive_hypot(x, y):
        # Reference implementation that overflows for large inputs
        return math.sqrt(x * x + y * y)

    overflow_pattern = 'overflow encountered in .*_scalars'
    for fltty in (types.float32, types.float64):
        compiled = self.ccache.compile(pyfunc, (fltty, fltty), flags=flags)
        cfunc = compiled.entry_point
        scalar_cls = numpy_support.as_dtype(fltty).type
        big = scalar_cls(np.finfo(scalar_cls).max / 30.)

        # The compiled implementation must stay finite and match Python's
        nb_ans = cfunc(big, big)
        self.assertPreciseEqual(nb_ans, pyfunc(big, big), prec='single')
        self.assertTrue(np.isfinite(nb_ans))

        # ...whereas the naive formula overflows
        with warnings.catch_warnings():
            warnings.simplefilter("error", RuntimeWarning)
            with self.assertRaisesRegexp(RuntimeWarning, overflow_pattern):
                naive_hypot(big, big)
def map_struct_to_record_dtype(cffi_type):
    """Convert a cffi struct type into a Numba record type.

    A NumPy record dtype is assembled from the struct's fields (names,
    formats, offsets and total size) and converted with
    ``numpy_support.from_dtype``.  Fields with a bitshift, flags or a
    bitsize are rejected with ``ValueError``.
    """
    itemsize = ffi.sizeof(cffi_type)
    names, formats, offsets = [], [], []
    is_aligned = True

    # (attribute, value meaning "not used", word used in the message)
    unsupported = (
        ('bitshift', -1, 'bitshift'),
        ('flags', 0, 'flags'),
        ('bitsize', -1, 'bitsize'),
    )

    for field_name, field in cffi_type.fields:
        # guard unsupported values
        for attr_name, neutral, word in unsupported:
            if getattr(field, attr_name) != neutral:
                msg = "field {!r} has " + word + ", this is not supported"
                raise ValueError(msg.format(field_name))

        field_dtype = numpy_support.as_dtype(
            map_type(field.type, use_record_dtype=True),
        )
        names.append(field_name)
        formats.append(field_dtype)
        offsets.append(field.offset)
        # A single misaligned field makes the whole record unaligned
        if field.offset % field_dtype.alignment != 0:
            is_aligned = False

    spec = {
        'names': names,
        'formats': formats,
        'offsets': offsets,
        'itemsize': itemsize,
    }
    return numpy_support.from_dtype(np.dtype(spec, align=is_aligned))
def test_record_dtype_with_titles_roundtrip(self):
    """Round-trip a record dtype with a titled field through Numba.

    The NumPy dtype is converted with ``numpy_support.from_dtype``, the
    title bookkeeping of the resulting Numba type is checked, and the type
    is converted back with ``numpy_support.as_dtype``; the result must
    compare equal to the original dtype.
    """
    # np.float64 is used instead of the removed ``np.float`` alias (which
    # was the builtin float, i.e. the same float64 dtype).
    recdtype = np.dtype([(("title a", 'a'), np.float64), ('b', np.float64)])
    nbtype = numpy_support.from_dtype(recdtype)
    self.assertTrue(nbtype.is_title('title a'))
    self.assertFalse(nbtype.is_title('a'))
    self.assertFalse(nbtype.is_title('b'))
    got = numpy_support.as_dtype(nbtype)
    # assertEqual, not assertTrue: with assertTrue the second argument is
    # only the failure message, so nothing was actually compared.
    self.assertEqual(got, recdtype)
def xinfo_impl(arg):
    """Return an implementation that yields ``container`` filled with the
    attributes listed in ``attr`` of ``np_func`` applied to the argument's
    dtype.

    The attribute values are read once here and captured by the returned
    implementation.
    """
    # Use the element type when ``arg`` has one, otherwise ``arg`` itself
    nbty = getattr(arg, 'dtype', arg)
    info = np_func(as_dtype(nbty))
    data = tuple(getattr(info, field) for field in attr)

    def impl(arg):
        return container(*data)

    return impl
def _build_element_wise_ufunc_wrapper(cres, signature):
    '''Create the ufunc loop wrapper for a compilation result.

    The wrapper is built for the element-wise ``signature`` while holding
    ``compiler.lock_compiler``.  Returns the NumPy dtype numbers (arguments
    first, return type last), the value returned by ``build_ufunc_wrapper``
    and the environment of ``cres``.
    '''
    target_ctx = cres.target_context
    codelib = cres.library
    symbol = cres.fndesc.llvm_func_name

    env = cres.environment
    envptr = env.as_pointer(target_ctx)

    with compiler.lock_compiler:
        ptr = build_ufunc_wrapper(codelib, target_ctx, symbol, signature,
                                  cres.objectmode, envptr, env)

    loop_types = list(signature.args) + [signature.return_type]
    dtypenums = [as_dtype(ty).num for ty in loop_types]
    return dtypenums, ptr, env
def array_cumprod(context, builder, sig, args):
    """Lower a cumulative product over the flattened input array.

    The Python implementation below is compiled with ``compile_internal``;
    the accumulator ``c`` is forced to the scalar type of the result.
    """
    scalar_dtype = sig.return_type.dtype
    np_dtype = as_dtype(scalar_dtype)

    def array_cumprod_impl(arr):
        # Total element count, computed from the shape
        n_items = 1
        for extent in arr.shape:
            n_items = n_items * extent
        out = numpy.empty(n_items, np_dtype)
        c = 1
        for pos, item in enumerate(arr.flat):
            c *= item
            out[pos] = c
        return out

    # The key must stay ``c``: it names the accumulator local above
    forced_locals = {'c': scalar_dtype}
    res = context.compile_internal(builder, array_cumprod_impl, sig, args,
                                   locals=forced_locals)
    return impl_ret_new_ref(context, builder, sig.return_type, res)
def roots_impl(p):
    """Return an implementation computing polynomial roots of ``p`` as the
    eigenvalues of its companion matrix."""
    # cast int vectors to float cf. numpy, this is a bit dicey as
    # the roots could be complex which will fail anyway
    ty = getattr(p, 'dtype', p)
    cast_t = np.float64 if isinstance(ty, types.Integer) else np_support.as_dtype(ty)

    def roots_impl(p):
        # impl based on numpy:
        # https://github.com/numpy/numpy/blob/master/numpy/lib/polynomial.py

        if len(p.shape) != 1:
            raise ValueError("Input must be a 1d array.")

        nz_idx = np.nonzero(p)[0]

        if len(nz_idx) == 0:
            return np.zeros(0, dtype=cast_t)

        # number of zero coefficients after the last non-zero one
        trailing = len(p) - nz_idx[-1] - 1

        # pull out the coeffs selecting between possible zero pads
        p = p[int(nz_idx[0]):int(nz_idx[-1]) + 1]

        ncoef = len(p)
        if ncoef > 1:
            # construct companion matrix, ensure fortran order
            # to give to eigvals, write to upper diag and then
            # transpose.
            companion = np.diag(np.ones((ncoef - 2,), cast_t), 1).T
            companion[0, :] = -p[1:] / p[0]  # normalize
            roots = np.linalg.eigvals(companion)
        else:
            roots = np.zeros(0, dtype=cast_t)

        # add in additional zeros on the end if needed
        if tz_needed(trailing):
            return np.hstack((roots, np.zeros(trailing, dtype=cast_t)))
        else:
            return roots

    def tz_needed(count):
        # Trailing zero coefficients become additional zero roots
        return count > 0

    return roots_impl
def build(self, cres):
    """
    Build the gufunc wrapper for ``cres``.

    Returns (dtype numbers, function ptr, EnvironmentObject)
    """
    signature = cres.signature
    ptr, env = build_gufunc_wrapper(
        self.py_func,
        cres,
        self.sin,
        self.sout,
        cache=self.cache,
    )

    def _scalar_type(arg):
        # Array arguments are described by their element dtype
        return arg.dtype if isinstance(arg, types.Array) else arg

    dtypenums = [as_dtype(_scalar_type(arg)).num for arg in signature.args]
    return dtypenums, ptr, env
def build(self, cres):
    """
    Build the gufunc wrapper for ``cres``.

    Returns (dtype numbers, function ptr, EnvironmentObject)
    """
    signature = cres.signature
    # The wrapper name is part of the returned triple but is not used here
    ptr, env, wrapper_name = build_gufunc_wrapper(
        self.py_func,
        cres,
        self.sin,
        self.sout,
        cache=self.cache,
    )

    # Array arguments contribute their element dtype; others are used as-is
    dtypenums = [
        as_dtype(arg.dtype if isinstance(arg, types.Array) else arg).num
        for arg in signature.args
    ]
    return dtypenums, ptr, env
def array(self, shape, dtype):
    """Return the array backing an allocation request.

    A ``shape`` of 0 requests dynamic shared memory and yields a view over
    ``self._dynshared``.  Any other request is keyed on the caller's source
    file and line number, so repeated calls from the same call site return
    the same array.
    """
    np_dtype = numpy_support.as_dtype(dtype)

    # Dynamic shared memory is requested with size 0 - this all shares the
    # same underlying memory
    if shape == 0:
        # Count must be the maximum number of whole elements that fit in the
        # buffer (Numpy complains if the buffer is not a multiple of the
        # element size)
        whole_items = self._dynshared_size // np_dtype.itemsize
        return np.frombuffer(self._dynshared.data, dtype=np_dtype,
                             count=whole_items)

    # Otherwise, identify allocations by source file and line number.
    # extract_stack() must be called from this frame so that [-2] is the
    # immediate caller.
    call_site = traceback.extract_stack()[-2][0:2]
    existing = self._allocations.get(call_site)
    if existing is not None:
        return existing

    fresh = np.empty(shape, np_dtype)
    self._allocations[call_site] = fresh
    return fresh
def test_cfunc_callback(self):
    """A cfunc reading cffi structs through ``carray`` must agree with the
    same jitted computation run on a NumPy view of the buffer."""
    ffi = self.get_ffi()
    big_struct = ffi.typeof('big_struct')
    nb_big_struct = cffi_support.map_type(big_struct, use_record_dtype=True)
    sig = cffi_support.map_type(ffi.typeof('myfunc'), use_record_dtype=True)

    @njit
    def calc(base):
        tmp = 0
        for i in range(base.size):
            elem = base[i]
            tmp += elem.i1 * elem.f2 / elem.d3
            tmp += base[i].af4.sum()
        return tmp

    @cfunc(sig)
    def foo(ptr, n):
        base = carray(ptr, n)
        return calc(base)

    # Make data
    n_records = 3
    mydata = ffi.new('big_struct[3]')
    records = ffi.cast('big_struct*', mydata)
    for rec_idx in range(n_records):
        records[rec_idx].i1 = rec_idx * 123
        records[rec_idx].f2 = rec_idx * 213
        records[rec_idx].d3 = (1 + rec_idx) * 213
        for slot in range(9):
            records[rec_idx].af4[slot] = rec_idx * 10 + slot

    # Call the cfunc through ctypes with the address of the data
    address = int(ffi.cast('size_t', records))
    got = foo.ctypes(address, 3)

    # Make numpy array from the cffi buffer
    record_dtype = numpy_support.as_dtype(nb_big_struct)
    array = np.ndarray(buffer=ffi.buffer(mydata), dtype=record_dtype, shape=3)
    expect = calc(array)
    self.assertEqual(got, expect)
def test_pickling_vectorize(self):
    """A CUDA vectorized function must survive two pickle round-trips."""
    @vectorize(['intp(intp)', 'float64(float64)'], target='cuda')
    def cuda_vect(x):
        return x * 2

    # accommodate int representations in np.arange
    intp_dtype = numpy_support.as_dtype(types.intp)
    # get expected result
    ary = np.arange(10, dtype=intp_dtype)
    expected = cuda_vect(ary)

    # first pickle; the original is dropped before the copy is used
    payload = pickle.dumps(cuda_vect)
    foo1 = pickle.loads(payload)
    del cuda_vect
    got1 = foo1(ary)
    np.testing.assert_equal(expected, got1)

    # second pickle, starting from the unpickled copy
    payload = pickle.dumps(foo1)
    foo2 = pickle.loads(payload)
    del foo1
    got2 = foo2(ary)
    np.testing.assert_equal(expected, got2)
def array_cumprod(context, builder, sig, args):
    """Lower a cumulative product over the flattened input array.

    The Python implementation below is compiled with ``compile_internal``;
    the accumulator ``c`` is forced to the scalar type of the result.
    """
    scalar_dtype = sig.return_type.dtype
    np_dtype = as_dtype(scalar_dtype)

    def array_cumprod_impl(arr):
        # Total element count, computed from the shape
        n_items = 1
        for extent in arr.shape:
            n_items = n_items * extent
        out = np.empty(n_items, np_dtype)
        c = 1
        for pos, item in enumerate(arr.flat):
            c *= item
            out[pos] = c
        return out

    # The key must stay ``c``: it names the accumulator local above
    forced_locals = {'c': scalar_dtype}
    res = context.compile_internal(builder, array_cumprod_impl, sig, args,
                                   locals=forced_locals)
    return impl_ret_new_ref(context, builder, sig.return_type, res)
def hpat_arrays_append_overload(A, B):
    '''Select an implementation that appends B to the underlying array A.

    B is either a single array or a list/tuple of arrays.  Numeric arrays
    and string arrays are supported; for any other combination of types no
    implementation is returned (``None``).
    '''
    sequence_types = (types.UniTuple, types.List)

    if isinstance(A, types.Array):
        if isinstance(B, types.Array):
            def _append_single_numeric_impl(A, B):
                return numpy.concatenate((A, B,))

            return _append_single_numeric_impl

        if isinstance(B, sequence_types):
            # TODO: this heavily relies on B being a homogeneous tuple/list - find a better way
            # to resolve common dtype of heterogeneous sequence of arrays
            candidate_dtypes = [
                numpy_support.as_dtype(A.dtype),
                numpy_support.as_dtype(B.dtype.dtype),
            ]
            np_common_dtype = numpy.find_common_type([], candidate_dtypes)
            numba_common_dtype = numpy_support.from_dtype(np_common_dtype)

            # TODO: refactor to use numpy.concatenate when Numba supports building a tuple at runtime
            def _append_list_numeric_impl(A, B):
                total_length = len(A) + numpy.array([len(arr) for arr in B]).sum()
                new_data = numpy.empty(total_length, numba_common_dtype)

                # copy A first, then every array of B right behind it
                stop = len(A)
                new_data[:stop] = A
                for arr in B:
                    start = stop
                    stop = start + len(arr)
                    new_data[start:stop] = arr
                return new_data

            return _append_list_numeric_impl

        # numeric A with an unsupported B
        return None

    if A == string_array_type:
        if B == string_array_type:
            def _append_single_string_array_impl(A, B):
                total_size = len(A) + len(B)
                total_chars = num_total_chars(A) + num_total_chars(B)
                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                pos += append_string_array_to(new_data, pos, B)

                return new_data

            return _append_single_string_array_impl

        if isinstance(B, sequence_types) and B.dtype == string_array_type:
            def _append_list_string_array_impl(A, B):
                array_list = [A] + list(B)
                total_size = numpy.array([len(arr) for arr in array_list]).sum()
                total_chars = numpy.array([num_total_chars(arr) for arr in array_list]).sum()

                new_data = sdc.str_arr_ext.pre_alloc_string_array(total_size, total_chars)

                pos = 0
                pos += append_string_array_to(new_data, pos, A)
                for arr in B:
                    pos += append_string_array_to(new_data, pos, arr)

                return new_data

            return _append_list_string_array_impl

    return None
def _stencil_wrapper(self, result, sigret, return_type, typemap, calltypes, *args):
    """Generate and compile the function that runs this stencil kernel.

    ``args`` holds the argument types; ``args[0]`` is the type of the first
    input array and determines the number of loop nests.  When ``result``
    is None the generated function allocates the output array itself,
    otherwise it takes an extra ``out`` keyword argument.  When ``sigret``
    is not None its ``pysig`` attribute is set to the Python signature of
    the generated function.

    Returns the value of ``compiler.compile_ir`` for the combined IR.

    Raises ValueError when the kernel uses the reserved name 'out', when
    standard indexing is requested for the first argument or for a name
    that is not a kernel argument, or when the type of the ``cval`` option
    differs from the dtype of ``return_type``.
    """
    # Overall approach:
    # 1) Construct a string containing a function definition for the stencil function
    #    that will execute the stencil kernel.  This function definition includes a
    #    unique stencil function name, the parameters to the stencil kernel, loop
    #    nests across the dimensions of the input array.  Those loop nests use the
    #    computed stencil kernel size so as not to try to compute elements where
    #    elements outside the bounds of the input array would be needed.
    # 2) The body of the loop nest in this new function is a special sentinel
    #    assignment.
    # 3) Get the IR of this new function.
    # 4) Split the block containing the sentinel assignment and remove the sentinel
    #    assignment.  Insert the stencil kernel IR into the stencil function IR
    #    after label and variable renaming of the stencil kernel IR to prevent
    #    conflicts with the stencil function IR.
    # 5) Compile the combined stencil function IR + stencil kernel IR into existence.

    # Copy the kernel so that our changes for this callsite
    # won't affect other callsites.
    (kernel_copy, copy_calltypes) = self.copy_ir_with_calltypes(self.kernel_ir, calltypes)
    # The stencil kernel body becomes the body of a loop, for which args aren't needed.
    ir_utils.remove_args(kernel_copy.blocks)
    first_arg = kernel_copy.arg_names[0]

    in_cps, out_cps = ir_utils.copy_propagate(kernel_copy.blocks, typemap)
    name_var_table = ir_utils.get_name_var_table(kernel_copy.blocks)
    ir_utils.apply_copy_propagate(kernel_copy.blocks, in_cps, name_var_table, typemap, copy_calltypes)

    if "out" in name_var_table:
        raise ValueError(
            "Cannot use the reserved word 'out' in stencil kernels.")

    sentinel_name = ir_utils.get_unused_var_name("__sentinel__", name_var_table)
    if config.DEBUG_ARRAY_OPT == 1:
        print("name_var_table", name_var_table, sentinel_name)

    the_array = args[0]

    if config.DEBUG_ARRAY_OPT == 1:
        print("_stencil_wrapper", return_type, return_type.dtype, type(return_type.dtype), args)
        ir_utils.dump_blocks(kernel_copy.blocks)

    # We generate a Numba function to execute this stencil and here
    # create the unique name of this function.
    stencil_func_name = "__numba_stencil_%s_%s" % (hex(
        id(the_array)).replace("-", "_"), self.id)

    # We will put a loop nest in the generated function for each
    # dimension in the input array.  Here we create the name for
    # the index variable for each dimension.  index0, index1, ...
    index_vars = []
    for i in range(the_array.ndim):
        index_var_name = ir_utils.get_unused_var_name(
            "index" + str(i), name_var_table)
        index_vars += [index_var_name]

    # Create extra signature for out and neighborhood.
    out_name = ir_utils.get_unused_var_name("out", name_var_table)
    neighborhood_name = ir_utils.get_unused_var_name(
        "neighborhood", name_var_table)
    sig_extra = ""
    if result is not None:
        sig_extra += ", {}=None".format(out_name)
    if "neighborhood" in dict(self.kws):
        sig_extra += ", {}=None".format(neighborhood_name)

    # Get a list of the standard indexed array names.
    standard_indexed = self.options.get("standard_indexing", [])

    if first_arg in standard_indexed:
        raise ValueError("The first argument to a stencil kernel must "
                         "use relative indexing, not standard indexing.")

    if len(set(standard_indexed) - set(kernel_copy.arg_names)) != 0:
        raise ValueError("Standard indexing requested for an array name "
                         "not present in the stencil kernel definition.")

    # Add index variables to getitems in the IR to transition the accesses
    # in the kernel from relative to regular Python indexing.  Returns the
    # computed size of the stencil kernel and a list of the relatively indexed
    # arrays.
    kernel_size, relatively_indexed = self.add_indices_to_kernel(
        kernel_copy, index_vars, the_array.ndim, self.neighborhood, standard_indexed)
    # Remember the computed size so it is passed in on later calls.
    if self.neighborhood is None:
        self.neighborhood = kernel_size

    if config.DEBUG_ARRAY_OPT == 1:
        print("After add_indices_to_kernel")
        ir_utils.dump_blocks(kernel_copy.blocks)

    # The return in the stencil kernel becomes a setitem for that
    # particular point in the iteration space.
    ret_blocks = self.replace_return_with_setitem(kernel_copy.blocks, index_vars, out_name)

    if config.DEBUG_ARRAY_OPT == 1:
        print("After replace_return_with_setitem", ret_blocks)
        ir_utils.dump_blocks(kernel_copy.blocks)

    # Start to form the new function to execute the stencil kernel.
    func_text = "def {}({}{}):\n".format(stencil_func_name, ",".join(kernel_copy.arg_names), sig_extra)

    # Get loop ranges for each dimension, which could be either int
    # or variable.  In the latter case we'll use the extra neighborhood
    # argument to the function.
    ranges = []
    for i in range(the_array.ndim):
        if isinstance(kernel_size[i][0], int):
            lo = kernel_size[i][0]
            hi = kernel_size[i][1]
        else:
            lo = "{}[{}][0]".format(neighborhood_name, i)
            hi = "{}[{}][1]".format(neighborhood_name, i)
        ranges.append((lo, hi))

    # If there are more than one relatively indexed arrays, add a call to
    # a function that will raise an error if any of the relatively indexed
    # arrays are of different size than the first input array.
    if len(relatively_indexed) > 1:
        func_text += " raise_if_incompatible_array_sizes(" + first_arg
        for other_array in relatively_indexed:
            if other_array != first_arg:
                func_text += "," + other_array
        func_text += ")\n"

    # Get the shape of the first input array.
    shape_name = ir_utils.get_unused_var_name("full_shape", name_var_table)
    func_text += " {} = {}.shape\n".format(shape_name, first_arg)

    # If we have to allocate the output array (the out argument was not used)
    # then use np.full if the user specified a cval stencil decorator option
    # or np.zeros if they didn't to allocate the array.
    if result is None:
        return_type_name = numpy_support.as_dtype(
            return_type.dtype).type.__name__
        if "cval" in self.options:
            cval = self.options["cval"]
            if return_type.dtype != typing.typeof.typeof(cval):
                raise ValueError(
                    "cval type does not match stencil return type.")

            out_init = "{} = np.full({}, {}, dtype=np.{})\n".format(
                out_name, shape_name, cval, return_type_name)
        else:
            out_init = "{} = np.zeros({}, dtype=np.{})\n".format(
                out_name, shape_name, return_type_name)
        func_text += " " + out_init

    # ``offset`` is the current nesting depth, i.e. how many indentation
    # units precede the next generated line.
    offset = 1
    # Add the loop nests to the new function.
    for i in range(the_array.ndim):
        for j in range(offset):
            func_text += " "
        # ranges[i][0] is the minimum index used in the i'th dimension
        # but minimum's greater than 0 don't preclude any entry in the array.
        # So, take the minimum of 0 and the minimum index found in the kernel
        # and this will be a negative number (potentially -0).  Then, we do
        # unary - on that to get the positive offset in this dimension whose
        # use is precluded.
        # ranges[i][1] is the maximum of 0 and the observed maximum index
        # in this dimension because negative maximums would not cause us to
        # preclude any entry in the array from being used.
        func_text += ("for {} in range(-min(0,{}),"
                      "{}[{}]-max(0,{})):\n").format(
                          index_vars[i],
                          ranges[i][0],
                          shape_name,
                          i,
                          ranges[i][1])
        offset += 1

    for j in range(offset):
        func_text += " "
    # Put a sentinel in the code so we can locate it in the IR.  We will
    # remove this sentinel assignment and replace it with the IR for the
    # stencil kernel body.
    func_text += "{} = 0\n".format(sentinel_name)
    func_text += " return {}\n".format(out_name)

    if config.DEBUG_ARRAY_OPT == 1:
        print("new stencil func text")
        print(func_text)

    # Force the new stencil function into existence.
    # NOTE(review): on Python 3 the ``in globals(), locals()`` part is just
    # a tuple expression evaluated after exec_() returns; the generated
    # function is then looked up by name with eval() - confirm the
    # namespace exec_() defaults to.
    exec_(func_text) in globals(), locals()
    stencil_func = eval(stencil_func_name)
    if sigret is not None:
        pysig = utils.pysignature(stencil_func)
        sigret.pysig = pysig
    # Get the IR for the newly created stencil function.
    stencil_ir = compiler.run_frontend(stencil_func)
    ir_utils.remove_dels(stencil_ir.blocks)

    # rename all variables in stencil_ir afresh
    var_table = ir_utils.get_name_var_table(stencil_ir.blocks)
    new_var_dict = {}
    reserved_names = (
        [sentinel_name, out_name, neighborhood_name, shape_name]
        + kernel_copy.arg_names + index_vars)
    for name, var in var_table.items():
        if not name in reserved_names:
            new_var_dict[name] = ir_utils.mk_unique_var(name)
    ir_utils.replace_var_names(stencil_ir.blocks, new_var_dict)

    stencil_stub_last_label = max(stencil_ir.blocks.keys()) + 1

    # Shift labels in the kernel copy so they are guaranteed unique
    # and don't conflict with any labels in the stencil_ir.
    kernel_copy.blocks = ir_utils.add_offset_to_labels(
        kernel_copy.blocks, stencil_stub_last_label)
    new_label = max(kernel_copy.blocks.keys()) + 1
    # Adjust ret_blocks to account for addition of the offset.
    ret_blocks = [x + stencil_stub_last_label for x in ret_blocks]

    if config.DEBUG_ARRAY_OPT == 1:
        print("ret_blocks w/ offsets", ret_blocks, stencil_stub_last_label)
        print("before replace sentinel stencil_ir")
        ir_utils.dump_blocks(stencil_ir.blocks)
        print("before replace sentinel kernel_copy")
        ir_utils.dump_blocks(kernel_copy.blocks)

    # Search all the block in the stencil outline for the sentinel.
    for label, block in stencil_ir.blocks.items():
        for i, inst in enumerate(block.body):
            if (isinstance(inst, ir.Assign) and
                    inst.target.name == sentinel_name):
                # We found the sentinel assignment.
                loc = inst.loc
                scope = block.scope
                # split block across __sentinel__
                # A new block is allocated for the statements prior to the
                # sentinel but the new block maintains the current block
                # label.
                prev_block = ir.Block(scope, loc)
                prev_block.body = block.body[:i]
                # The current block is used for statements after sentinel.
                block.body = block.body[i + 1:]
                # But the current block gets a new label.
                body_first_label = min(kernel_copy.blocks.keys())

                # The previous block jumps to the minimum labelled block of
                # the parfor body.
                prev_block.append(ir.Jump(body_first_label, loc))
                # Add all the parfor loop body blocks to the gufunc
                # function's IR.
                for (l, b) in kernel_copy.blocks.items():
                    stencil_ir.blocks[l] = b

                stencil_ir.blocks[new_label] = block
                stencil_ir.blocks[label] = prev_block
                # Add a jump from all the blocks that previously contained
                # a return in the stencil kernel to the block
                # containing statements after the sentinel.
                for ret_block in ret_blocks:
                    stencil_ir.blocks[ret_block].append(
                        ir.Jump(new_label, loc))
                break
        else:
            # No sentinel in this block: keep searching the next block.
            continue
        # The sentinel was found and replaced: stop searching.
        break

    stencil_ir.blocks = ir_utils.rename_labels(stencil_ir.blocks)
    ir_utils.remove_dels(stencil_ir.blocks)

    assert (isinstance(the_array, types.Type))
    array_types = args

    new_stencil_param_types = list(array_types)

    if config.DEBUG_ARRAY_OPT == 1:
        print("new_stencil_param_types", new_stencil_param_types)
        ir_utils.dump_blocks(stencil_ir.blocks)

    # Compile the combined stencil function with the replaced loop
    # body in it.
    new_func = compiler.compile_ir(self._typingctx, self._targetctx, stencil_ir, new_stencil_param_types, None, compiler.DEFAULT_FLAGS, {})
    return new_func
def lstsq_impl(a, b, rcond=-1.0):
    """Build the least-squares implementation for the types of ``a``, ``b``.

    ``a`` must be a matrix and ``b`` may be 1D or 2D.  The dtypes of both
    are promoted to a shared dtype, the ``numba_ez_gelsd`` LAPACK wrapper is
    declared for it, and helper specialisations depending on ``b`` are
    selected.  The returned function yields ``(x, res, rank, s[:minmn])``.
    """
    ensure_lapack()

    _check_linalg_matrix(a, "lstsq")

    # B can be 1D or 2D.
    _check_linalg_1_or_2d_matrix(b, "lstsq")

    a_F_layout = a.layout == 'F'
    b_F_layout = b.layout == 'F'

    # the typing context is not easily accessible in `@overload` mode
    # so type unification etc. is done manually below
    a_np_dt = np_support.as_dtype(a.dtype)
    b_np_dt = np_support.as_dtype(b.dtype)

    np_shared_dt = np.promote_types(a_np_dt, b_np_dt)
    nb_shared_dt = np_support.from_dtype(np_shared_dt)

    # convert typing floats to np floats for use in the impl
    r_type = getattr(nb_shared_dt, "underlying_float", nb_shared_dt)
    if r_type.bitwidth == 32:
        real_dtype = np.float32
    else:
        real_dtype = np.float64

    # the lapack wrapper signature
    numba_ez_gelsd_sig = types.intc(
        types.char,  # kind
        types.intp,  # m
        types.intp,  # n
        types.intp,  # nrhs
        types.CPointer(nb_shared_dt),  # a
        types.intp,  # lda
        types.CPointer(nb_shared_dt),  # b
        types.intp,  # ldb
        types.CPointer(r_type),  # S
        types.float64,  # rcond
        types.CPointer(types.intc)  # rank
    )

    # the lapack wrapper function
    numba_ez_gelsd = types.ExternalFunction("numba_ez_gelsd",
                                            numba_ez_gelsd_sig)

    kind = ord(get_blas_kind(nb_shared_dt, "lstsq"))

    # The following functions select specialisations based on
    # information around 'b', a lot of this effort is required
    # as 'b' can be either 1D or 2D, and then there are
    # some optimisations available depending on real or complex
    # space.

    # get a specialisation for computing the number of RHS
    b_nrhs = _get_compute_nrhs(b)

    # get a specialised residual computation based on the dtype
    compute_res = _get_res_impl(nb_shared_dt, real_dtype, b)

    # b copy function
    b_copy_in = _get_copy_in_b_impl(b)

    # return blob function
    b_ret = _get_compute_return_impl(b)

    # check system is dimensionally valid function
    check_dimensionally_valid = _get_check_lstsq_dimensionally_valid_impl(a, b)

    def lstsq_impl(a, b, rcond=-1.0):
        n = a.shape[-1]
        m = a.shape[-2]
        nrhs = b_nrhs(b)

        # check the systems have no inf or NaN
        _check_finite_matrix(a)
        _check_finite_matrix(b)

        # check the systems is dimensionally valid
        check_dimensionally_valid(a, b)

        minmn = min(m, n)
        maxmn = max(m, n)

        # a is destroyed on exit, copy it
        acpy = a.astype(np_shared_dt)
        if a_F_layout:
            acpy = np.copy(acpy)
        else:
            acpy = np.asfortranarray(acpy)

        # b is overwritten on exit with the solution, copy allocate
        bcpy = np.empty((nrhs, maxmn), dtype=np_shared_dt).T
        # specialised copy in due to b being 1 or 2D
        b_copy_in(bcpy, b, nrhs)

        # Allocate returns
        s = np.empty(minmn, dtype=real_dtype)
        rank_ptr = np.empty(1, dtype=np.int32)

        r = numba_ez_gelsd(
            kind,  # kind
            m,  # m
            n,  # n
            nrhs,  # nrhs
            acpy.ctypes,  # a
            m,  # lda
            bcpy.ctypes,  # b
            maxmn,  # ldb
            s.ctypes,  # s
            rcond,  # rcond
            rank_ptr.ctypes  # rank
        )

        # a negative return code is treated as fatal
        if r < 0:
            fatal_error_func()
            assert 0   # unreachable

        # set rank to that which was computed
        rank = rank_ptr[0]

        # compute residuals
        if rank < n or m <= n:
            res = np.empty((0), dtype=real_dtype)
        else:
            # this requires additional dispatch as there's a faster
            # impl if the result is in the real domain (no abs() required)
            res = compute_res(bcpy, n, nrhs)

        # extract 'x', the solution
        x = b_ret(bcpy, n)

        # help liveness analysis
        acpy.size
        bcpy.size
        s.size
        rank_ptr.size

        return (x, res, rank, s[:minmn])

    return lstsq_impl
def min_dtype_int_val(dtype):
    """Return `np.iinfo(...).min` for the NumPy dtype matching *dtype*.

    *dtype* is converted with `numpy_support.as_dtype` before the lookup.
    """
    return np.iinfo(numpy_support.as_dtype(dtype)).min
def check(typestring, numba_type):
    """Assert that *typestring* and *numba_type* convert to one another.

    Only native ordering and alignment is supported.
    """
    np_dtype = np.dtype(typestring)

    # NumPy dtype -> Numba type
    converted = numpy_support.from_dtype(np_dtype)
    self.assertEqual(converted, numba_type)

    # Numba type -> NumPy dtype
    round_tripped = numpy_support.as_dtype(numba_type)
    self.assertEqual(np_dtype, round_tripped)
def check(typechar, numba_type):
    """Assert that `f` maps the dtype for *typechar* to *numba_type*.

    The dtype is tried both as given and with an explicit '=' prefix;
    only native ordering and alignment is supported.  The reverse
    conversion through `numpy_support.as_dtype` is verified as well.
    """
    plain = np.dtype(typechar)
    self.assertIs(f(plain), numba_type)

    prefixed = np.dtype('=' + typechar)
    self.assertIs(f(prefixed), numba_type)

    recovered = numpy_support.as_dtype(numba_type)
    self.assertEqual(plain, recovered)
def check(base_inst, enum_def, type_class):
    """Assert that `as_dtype` recovers the dtype of *base_inst*.

    A `type_class(enum_def, <numba type>)` instance is built from the
    Numba type corresponding to `np.dtype(base_inst)`, and converting
    that instance back must yield the same NumPy dtype.
    """
    expected = np.dtype(base_inst)
    wrapped = type_class(enum_def, numpy_support.from_dtype(expected))
    self.assertEqual(expected, numpy_support.as_dtype(wrapped))
def array(self, shape, dtype):
    """Return an uninitialised array of *shape*.

    *dtype* is first converted with `numpy_support.as_dtype`.
    """
    return np.empty(shape, numpy_support.as_dtype(dtype))
def check(dtype, numba_type, code):
    """Assert *dtype* <-> *numba_type* conversion and the unit code.

    The type produced by `from_dtype` must equal *numba_type* and carry
    `unit_code == code`; both it and *numba_type* must convert back to
    *dtype*.
    """
    converted = numpy_support.from_dtype(dtype)
    self.assertEqual(converted, numba_type)
    self.assertEqual(converted.unit_code, code)

    # Reverse direction, first for the expected type and then for the
    # type actually produced.
    for candidate in (numba_type, converted):
        self.assertEqual(numpy_support.as_dtype(candidate), dtype)
def _stencil_wrapper(self, result, sigret, return_type, typemap, calltypes, *args):
    """Generate and compile the function that runs the stencil kernel.

    Parameters
    ----------
    result
        When None, the generated function allocates the output array
        itself; otherwise an extra `out` keyword parameter is added to
        the generated signature.
    sigret
        When not None, its `pysig` attribute is set to the Python
        signature of the generated function.
    return_type
        Its `.dtype` determines the dtype of the allocated output and
        is checked against the `cval` option.
    typemap, calltypes
        Passed through to the IR helpers operating on the kernel copy.
    *args
        Argument types of the call; `args[0]` is the first array type
        and supplies the number of dimensions.

    Returns
    -------
    The result of `compiler.compile_ir` for the combined IR.

    Raises
    ------
    ValueError
        If the kernel uses the reserved name 'out', if standard indexing
        is requested for the first argument or for an unknown argument
        name, or if the `cval` option does not match the return type.
    """
    # Overall approach:
    # 1) Construct a string containing a function definition for the stencil function
    #    that will execute the stencil kernel.  This function definition includes a
    #    unique stencil function name, the parameters to the stencil kernel, loop
    #    nests across the dimensions of the input array.  Those loop nests use the
    #    computed stencil kernel size so as not to try to compute elements where
    #    elements outside the bounds of the input array would be needed.
    # 2) The body of the loop nest in this new function is a special sentinel
    #    assignment.
    # 3) Get the IR of this new function.
    # 4) Split the block containing the sentinel assignment and remove the sentinel
    #    assignment.  Insert the stencil kernel IR into the stencil function IR
    #    after label and variable renaming of the stencil kernel IR to prevent
    #    conflicts with the stencil function IR.
    # 5) Compile the combined stencil function IR + stencil kernel IR into existence.

    # Copy the kernel so that our changes for this callsite
    # won't affect other callsites.
    (kernel_copy, copy_calltypes) = self.copy_ir_with_calltypes(
        self.kernel_ir, calltypes)
    # The stencil kernel body becomes the body of a loop, for which args aren't needed.
    ir_utils.remove_args(kernel_copy.blocks)
    first_arg = kernel_copy.arg_names[0]

    in_cps, out_cps = ir_utils.copy_propagate(kernel_copy.blocks, typemap)
    name_var_table = ir_utils.get_name_var_table(kernel_copy.blocks)
    ir_utils.apply_copy_propagate(
        kernel_copy.blocks,
        in_cps,
        name_var_table,
        typemap,
        copy_calltypes)

    if "out" in name_var_table:
        raise ValueError("Cannot use the reserved word 'out' in stencil kernels.")

    sentinel_name = ir_utils.get_unused_var_name("__sentinel__", name_var_table)
    if config.DEBUG_ARRAY_OPT == 1:
        print("name_var_table", name_var_table, sentinel_name)

    the_array = args[0]

    if config.DEBUG_ARRAY_OPT == 1:
        print("_stencil_wrapper", return_type, return_type.dtype,
              type(return_type.dtype), args)
        ir_utils.dump_blocks(kernel_copy.blocks)

    # We generate a Numba function to execute this stencil and here
    # create the unique name of this function.
    stencil_func_name = "__numba_stencil_%s_%s" % (
        hex(id(the_array)).replace("-", "_"),
        self.id)

    # We will put a loop nest in the generated function for each
    # dimension in the input array.  Here we create the name for
    # the index variable for each dimension.  index0, index1, ...
    index_vars = []
    for i in range(the_array.ndim):
        index_var_name = ir_utils.get_unused_var_name("index" + str(i),
                                                      name_var_table)
        index_vars += [index_var_name]

    # Create extra signature for out and neighborhood.
    out_name = ir_utils.get_unused_var_name("out", name_var_table)
    neighborhood_name = ir_utils.get_unused_var_name("neighborhood",
                                                     name_var_table)
    sig_extra = ""
    if result is not None:
        sig_extra += ", {}=None".format(out_name)
    if "neighborhood" in dict(self.kws):
        sig_extra += ", {}=None".format(neighborhood_name)

    # Get a list of the standard indexed array names.
    standard_indexed = self.options.get("standard_indexing", [])

    if first_arg in standard_indexed:
        raise ValueError("The first argument to a stencil kernel must "
                         "use relative indexing, not standard indexing.")

    if len(set(standard_indexed) - set(kernel_copy.arg_names)) != 0:
        raise ValueError("Standard indexing requested for an array name "
                         "not present in the stencil kernel definition.")

    # Add index variables to getitems in the IR to transition the accesses
    # in the kernel from relative to regular Python indexing.  Returns the
    # computed size of the stencil kernel and a list of the relatively indexed
    # arrays.
    kernel_size, relatively_indexed = self.add_indices_to_kernel(
        kernel_copy, index_vars, the_array.ndim,
        self.neighborhood, standard_indexed, typemap, copy_calltypes)
    # Remember the computed kernel size when no neighborhood was set.
    if self.neighborhood is None:
        self.neighborhood = kernel_size

    if config.DEBUG_ARRAY_OPT == 1:
        print("After add_indices_to_kernel")
        ir_utils.dump_blocks(kernel_copy.blocks)

    # The return in the stencil kernel becomes a setitem for that
    # particular point in the iteration space.
    ret_blocks = self.replace_return_with_setitem(kernel_copy.blocks,
                                                  index_vars, out_name)

    if config.DEBUG_ARRAY_OPT == 1:
        print("After replace_return_with_setitem", ret_blocks)
        ir_utils.dump_blocks(kernel_copy.blocks)

    # Start to form the new function to execute the stencil kernel.
    func_text = "def {}({}{}):\n".format(stencil_func_name,
                    ",".join(kernel_copy.arg_names), sig_extra)

    # Get loop ranges for each dimension, which could be either int
    # or variable.  In the latter case we'll use the extra neighborhood
    # argument to the function.
    ranges = []
    for i in range(the_array.ndim):
        if isinstance(kernel_size[i][0], int):
            lo = kernel_size[i][0]
            hi = kernel_size[i][1]
        else:
            lo = "{}[{}][0]".format(neighborhood_name, i)
            hi = "{}[{}][1]".format(neighborhood_name, i)
        ranges.append((lo, hi))

    # If there are more than one relatively indexed arrays, add a call to
    # a function that will raise an error if any of the relatively indexed
    # arrays are of different size than the first input array.
    if len(relatively_indexed) > 1:
        func_text += " raise_if_incompatible_array_sizes(" + first_arg
        for other_array in relatively_indexed:
            if other_array != first_arg:
                func_text += "," + other_array
        func_text += ")\n"

    # Get the shape of the first input array.
    shape_name = ir_utils.get_unused_var_name("full_shape", name_var_table)
    func_text += " {} = {}.shape\n".format(shape_name, first_arg)

    # If we have to allocate the output array (the out argument was not used)
    # then use numpy.full if the user specified a cval stencil decorator option
    # or np.zeros if they didn't to allocate the array.
    if result is None:
        return_type_name = numpy_support.as_dtype(
            return_type.dtype).type.__name__
        if "cval" in self.options:
            cval = self.options["cval"]
            # NOTE(review): this branch requires the cval type to equal the
            # return dtype exactly, whereas the branch below only requires
            # it to be convertible -- confirm the asymmetry is intended.
            if return_type.dtype != typing.typeof.typeof(cval):
                raise ValueError(
                    "cval type does not match stencil return type.")

            out_init ="{} = np.full({}, {}, dtype=np.{})\n".format(
                out_name, shape_name, cval, return_type_name)
        else:
            out_init ="{} = np.zeros({}, dtype=np.{})\n".format(
                out_name, shape_name, return_type_name)
        func_text += " " + out_init
    else: # result is present, if cval is set then use it
        if "cval" in self.options:
            cval = self.options["cval"]
            cval_ty = typing.typeof.typeof(cval)
            if not self._typingctx.can_convert(cval_ty, return_type.dtype):
                msg = "cval type does not match stencil return type."
                raise ValueError(msg)
            out_init = "{}[:] = {}\n".format(out_name, cval)
            func_text += " " + out_init

    # `offset` counts the indentation units needed for the next
    # generated line.
    offset = 1
    # Add the loop nests to the new function.
    for i in range(the_array.ndim):
        for j in range(offset):
            func_text += " "
        # ranges[i][0] is the minimum index used in the i'th dimension
        # but minimums greater than 0 don't preclude any entry in the array.
        # So, take the minimum of 0 and the minimum index found in the kernel
        # and this will be a negative number (potentially -0).  Then, we do
        # unary - on that to get the positive offset in this dimension whose
        # use is precluded.
        # ranges[i][1] is the maximum of 0 and the observed maximum index
        # in this dimension because negative maximums would not cause us to
        # preclude any entry in the array from being used.
        func_text += ("for {} in range(-min(0,{}),"
                      "{}[{}]-max(0,{})):\n").format(
                        index_vars[i],
                        ranges[i][0],
                        shape_name,
                        i,
                        ranges[i][1])
        offset += 1

    for j in range(offset):
        func_text += " "
    # Put a sentinel in the code so we can locate it in the IR.  We will
    # remove this sentinel assignment and replace it with the IR for the
    # stencil kernel body.
    func_text += "{} = 0\n".format(sentinel_name)
    func_text += " return {}\n".format(out_name)

    if config.DEBUG_ARRAY_OPT == 1:
        print("new stencil func text")
        print(func_text)

    # Force the new stencil function into existence.
    # NOTE(review): the trailing `in globals(), locals()` only builds a
    # throwaway tuple around the exec_() call result; where the generated
    # function gets defined depends on exec_ (defined elsewhere) -- confirm.
    exec_(func_text) in globals(), locals()
    stencil_func = eval(stencil_func_name)
    if sigret is not None:
        pysig = utils.pysignature(stencil_func)
        sigret.pysig = pysig
    # Get the IR for the newly created stencil function.
    stencil_ir = compiler.run_frontend(stencil_func)
    ir_utils.remove_dels(stencil_ir.blocks)

    # rename all variables in stencil_ir afresh, except for the names
    # that the kernel IR refers to
    var_table = ir_utils.get_name_var_table(stencil_ir.blocks)
    new_var_dict = {}
    reserved_names = ([sentinel_name, out_name, neighborhood_name,
                       shape_name] + kernel_copy.arg_names + index_vars)
    for name, var in var_table.items():
        if not name in reserved_names:
            new_var_dict[name] = ir_utils.mk_unique_var(name)
    ir_utils.replace_var_names(stencil_ir.blocks, new_var_dict)

    stencil_stub_last_label = max(stencil_ir.blocks.keys()) + 1

    # Shift labels in the kernel copy so they are guaranteed unique
    # and don't conflict with any labels in the stencil_ir.
    kernel_copy.blocks = ir_utils.add_offset_to_labels(
        kernel_copy.blocks, stencil_stub_last_label)
    new_label = max(kernel_copy.blocks.keys()) + 1
    # Adjust ret_blocks to account for addition of the offset.
    ret_blocks = [x + stencil_stub_last_label for x in ret_blocks]

    if config.DEBUG_ARRAY_OPT == 1:
        print("ret_blocks w/ offsets", ret_blocks, stencil_stub_last_label)
        print("before replace sentinel stencil_ir")
        ir_utils.dump_blocks(stencil_ir.blocks)
        print("before replace sentinel kernel_copy")
        ir_utils.dump_blocks(kernel_copy.blocks)

    # Search all the blocks in the stencil outline for the sentinel.
    for label, block in stencil_ir.blocks.items():
        for i, inst in enumerate(block.body):
            if (isinstance( inst, ir.Assign) and
                inst.target.name == sentinel_name):
                # We found the sentinel assignment.
                loc = inst.loc
                scope = block.scope
                # split block across __sentinel__
                # A new block is allocated for the statements prior to the
                # sentinel but the new block maintains the current block
                # label.
                prev_block = ir.Block(scope, loc)
                prev_block.body = block.body[:i]
                # The current block is used for statements after sentinel.
                block.body = block.body[i + 1:]
                # But the current block gets a new label.
                body_first_label = min(kernel_copy.blocks.keys())

                # The previous block jumps to the minimum labelled block of
                # the stencil kernel body.
                prev_block.append(ir.Jump(body_first_label, loc))
                # Add all the stencil kernel body blocks to the stencil
                # function's IR.
                for (l, b) in kernel_copy.blocks.items():
                    stencil_ir.blocks[l] = b

                stencil_ir.blocks[new_label] = block
                stencil_ir.blocks[label] = prev_block
                # Add a jump from all the blocks that previously contained
                # a return in the stencil kernel to the block
                # containing statements after the sentinel.
                for ret_block in ret_blocks:
                    stencil_ir.blocks[ret_block].append(
                        ir.Jump(new_label, loc))
                # Stop scanning this block's body; the outer loop is left
                # through the `break` following the for/else below.
                break
        else:
            # No sentinel in this block: try the next one.
            continue
        break

    stencil_ir.blocks = ir_utils.rename_labels(stencil_ir.blocks)
    ir_utils.remove_dels(stencil_ir.blocks)

    assert(isinstance(the_array, types.Type))
    array_types = args

    new_stencil_param_types = list(array_types)

    if config.DEBUG_ARRAY_OPT == 1:
        print("new_stencil_param_types", new_stencil_param_types)
        ir_utils.dump_blocks(stencil_ir.blocks)

    # Compile the combined stencil function with the replaced loop
    # body in it.
    new_func = compiler.compile_ir(
        self._typingctx,
        self._targetctx,
        stencil_ir,
        new_stencil_param_types,
        None,
        compiler.DEFAULT_FLAGS,
        {})
    return new_func