def get_if_positive_kernel(crit_dtype, dtype):
    """Build the elementwise ``if_positive`` selection kernel.

    The generated operation writes ``then_[i]`` to ``result[i]`` where
    ``crit[i] > 0`` and ``else_[i]`` otherwise.

    :arg crit_dtype: dtype of the ``crit`` vector argument.
    :arg dtype: dtype shared by the ``then_``, ``else_`` and ``result``
        vector arguments.
    :returns: whatever ``get_elwise_kernel`` returns for this signature.
    """
    # The criterion vector has its own dtype; the three value vectors share one.
    arguments = [VectorArg(crit_dtype, "crit")]
    arguments.extend(
        VectorArg(dtype, name) for name in ("then_", "else_", "result")
    )

    operation = "result[i] = crit[i] > 0 ? then_[i] : else_[i]"
    return get_elwise_kernel(arguments, operation, "if_positive")
def get_take_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the texture-based ``take`` kernel.

    For every source slot ``k`` in ``range(vec_count)`` the generated code
    declares a 1D texture ``tex_src<k>`` and emits
    ``dest<k>[i] = fp_tex1Dfetch(tex_src<k>, idx[i])``.

    :arg dtype: dtype of the destination vectors (and, via the
        ``with_fp_tex_hack`` C type, of the textures).
    :arg idx_dtype: dtype of the ``idx`` index vector.
    :arg vec_count: number of texture/destination pairs handled per launch.
    :returns: a tuple ``(func, tex_src)`` of the prepared kernel function and
        the list of texture references, in slot order.
    """
    type_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }
    slots = range(vec_count)

    # Kernel signature: index vector, one destination per slot, then length.
    kernel_args = [VectorArg(idx_dtype, "idx")]
    kernel_args.extend(VectorArg(dtype, "dest" + str(k)) for k in slots)
    kernel_args.append(ScalarArg(np.intp, "n"))

    # One texture declaration per slot, placed ahead of the kernel.
    texture_decls = [
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
        % (type_names["tex_tp"], k)
        for k in slots
    ]
    header = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_decls)

    # Read the index once, then fetch from each texture at that index.
    fetch_stmts = [
        "dest%d[i] = fp_tex1Dfetch(tex_src%d, src_idx);" % (k, k)
        for k in slots
    ]
    operation = ("%(idx_tp)s src_idx = idx[i];\n" % type_names) + "\n".join(
        fetch_stmts
    )

    module = get_elwise_module(kernel_args, operation, "take", preamble=header)
    take_func = module.get_function("take")
    textures = [module.get_texref("tex_src%d" % k) for k in slots]

    # Argument format: idx pointer, vec_count dest pointers, then n.
    arg_format = "P" + ("P" * vec_count) + np.dtype(np.uintp).char
    take_func.prepare(arg_format, texrefs=textures)
    return take_func, textures
def get_take_put_kernel(dtype, idx_dtype, with_offsets, vec_count=1):
    """Build and prepare the texture-based ``take_put`` kernel.

    The generated code reads ``gmem_src_idx[i]`` and ``gmem_dest_idx[i]`` and,
    for every slot ``k`` in ``range(vec_count)``, stores a fetch from texture
    ``tex_src<k>`` into ``dest<k>[dest_idx]``. When *with_offsets* is true the
    fetch position is ``src_idx + offset<k>``; otherwise it is ``src_idx``.

    :arg dtype: dtype of the destination vectors (and, via the
        ``with_fp_tex_hack`` C type, of the textures).
    :arg idx_dtype: dtype of both index vectors and of the offset scalars.
        Its ``.char`` attribute is read, so it must provide one.
    :arg with_offsets: whether per-slot ``offset<k>`` scalar arguments are
        added to the kernel signature.
    :arg vec_count: number of texture/destination pairs handled per launch.
    :returns: a tuple ``(func, tex_src)`` of the prepared kernel function and
        the list of texture references, in slot order.
    """
    type_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
        "tex_tp": dtype_to_ctype(dtype, with_fp_tex_hack=True),
    }
    slots = range(vec_count)

    # Kernel signature: both index vectors, destinations, optional offsets, n.
    kernel_args = [
        VectorArg(idx_dtype, "gmem_dest_idx"),
        VectorArg(idx_dtype, "gmem_src_idx"),
    ]
    kernel_args.extend(VectorArg(dtype, "dest%d" % k) for k in slots)
    if with_offsets:
        kernel_args.extend(ScalarArg(idx_dtype, "offset%d" % k) for k in slots)
    kernel_args.append(ScalarArg(np.intp, "n"))

    texture_decls = [
        "texture <%s, 1, cudaReadModeElementType> tex_src%d;"
        % (type_names["tex_tp"], k)
        for k in slots
    ]
    header = "#include <pycuda-helpers.hpp>\n\n" + "\n".join(texture_decls)

    # Pick the per-slot copy statement depending on whether offsets apply.
    if with_offsets:
        offset_template = (
            "dest%d[dest_idx] = "
            "fp_tex1Dfetch(tex_src%d, src_idx+offset%d);"
        )
        copy_stmts = [offset_template % (k, k, k) for k in slots]
    else:
        plain_template = "dest%d[dest_idx] = fp_tex1Dfetch(tex_src%d, src_idx);"
        copy_stmts = [plain_template % (k, k) for k in slots]

    index_loads = (
        "%(idx_tp)s src_idx = gmem_src_idx[i];\n"
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
    )
    operation = index_loads + "\n".join(copy_stmts)

    module = get_elwise_module(
        kernel_args, operation, "take_put", preamble=header
    )
    take_put_func = module.get_function("take_put")
    textures = [module.get_texref("tex_src%d" % k) for k in slots]

    # The index dtype's char is read unconditionally, even without offsets.
    offset_code = idx_dtype.char
    offset_format = offset_code * vec_count if with_offsets else ""

    # Argument format: two index pointers, dest pointers, offsets, then n.
    arg_format = (
        "PP" + ("P" * vec_count) + offset_format + np.dtype(np.uintp).char
    )
    take_put_func.prepare(arg_format, texrefs=textures)
    return take_put_func, textures
def get_linear_combination_kernel(summand_descriptors, dtype_z):
    """Build and prepare a kernel computing ``z[i] = sum_k a<k> * x<k>[i]``.

    :arg summand_descriptors: a non-empty iterable of
        ``(is_gpu_scalar, scalar_dtype, vector_dtype)`` tuples, one per
        summand ``k``:

        * if ``is_gpu_scalar`` is true, the coefficient ``a<k>`` is fetched
          from position 0 of a 1D texture ``tex_a<k>`` before the loop, and
          only the vector ``x<k>`` is added to the kernel arguments;
        * otherwise ``a<k>`` is passed as a scalar kernel argument, followed
          by the vector ``x<k>``.
    :arg dtype_z: dtype of the output vector ``z``.
    :returns: a tuple ``(func, tex_src)`` of the prepared kernel function and
        the list of texture references for the texture-held coefficients, in
        summand order.
    :raises ValueError: if *summand_descriptors* is empty, since the
        generated assignment would have no right-hand side.
    """
    # Materialize so emptiness can be checked even for one-shot iterables.
    summand_descriptors = list(summand_descriptors)
    if not summand_descriptors:
        raise ValueError(
            "summand_descriptors must contain at least one summand"
        )

    from pycuda.tools import dtype_to_ctype
    from pycuda.elementwise import VectorArg, ScalarArg, get_elwise_module

    args = []
    preamble = ["#include <pycuda-helpers.hpp>\n\n"]
    loop_prep = []
    summands = []
    tex_names = []

    for i, (is_gpu_scalar, scalar_dtype, vector_dtype) in enumerate(
        summand_descriptors
    ):
        if is_gpu_scalar:
            # Coefficient is held in a texture: declare it, and load it into
            # a local ``a<i>`` once before the elementwise loop.
            preamble.append(
                "texture <%s, 1, cudaReadModeElementType> tex_a%d;"
                % (dtype_to_ctype(scalar_dtype, with_fp_tex_hack=True), i)
            )
            args.append(VectorArg(vector_dtype, "x%d" % i))
            tex_names.append("tex_a%d" % i)
            loop_prep.append(
                "%s a%d = fp_tex1Dfetch(tex_a%d, 0)"
                % (dtype_to_ctype(scalar_dtype), i, i)
            )
        else:
            # Coefficient is passed directly as a scalar kernel argument.
            args.append(ScalarArg(scalar_dtype, "a%d" % i))
            args.append(VectorArg(vector_dtype, "x%d" % i))

        summands.append("a%d*x%d[i]" % (i, i))

    args.append(VectorArg(dtype_z, "z"))
    args.append(ScalarArg(np.uintp, "n"))

    mod = get_elwise_module(
        args,
        "z[i] = " + " + ".join(summands),
        "linear_combination",
        preamble="\n".join(preamble),
        loop_prep=";\n".join(loop_prep),
    )

    func = mod.get_function("linear_combination")
    tex_src = [mod.get_texref(tn) for tn in tex_names]
    # The prepare format is derived from the argument descriptors themselves.
    func.prepare("".join(arg.struct_char for arg in args), texrefs=tex_src)

    return func, tex_src
def get_put_kernel(dtype, idx_dtype, vec_count=1):
    """Build and prepare the ``put`` kernel.

    The generated code reads ``gmem_dest_idx[i]`` and, for every slot ``k`` in
    ``range(vec_count)``, performs ``dest<k>[dest_idx] = src<k>[i]``.

    :arg dtype: dtype of the destination and source vectors.
    :arg idx_dtype: dtype of the ``gmem_dest_idx`` index vector.
    :arg vec_count: number of destination/source pairs handled per launch.
    :returns: the prepared kernel function.
    """
    type_names = {
        "idx_tp": dtype_to_ctype(idx_dtype),
        "tp": dtype_to_ctype(dtype),
    }
    slots = range(vec_count)

    # Kernel signature: index vector, all destinations, all sources, then n.
    kernel_args = [VectorArg(idx_dtype, "gmem_dest_idx")]
    kernel_args.extend(VectorArg(dtype, "dest%d" % k) for k in slots)
    kernel_args.extend(VectorArg(dtype, "src%d" % k) for k in slots)
    kernel_args.append(ScalarArg(np.intp, "n"))

    store_stmts = ["dest%d[dest_idx] = src%d[i];" % (k, k) for k in slots]
    operation = (
        "%(idx_tp)s dest_idx = gmem_dest_idx[i];\n" % type_names
    ) + "\n".join(store_stmts)

    module = get_elwise_module(kernel_args, operation, "put")
    put_func = module.get_function("put")

    # Argument format: index pointer, 2 * vec_count data pointers, then n.
    put_func.prepare("P" + (2 * vec_count * "P") + np.dtype(np.uintp).char)
    return put_func