def __init__(self, dtype_out, neutral, reduce_expr, map_expr=None,
             arguments=None, name="reduce_kernel", keep=False, options=None,
             preamble=""):
    """Initialise the base reduction kernel and record ``shared_size``.

    Every argument is forwarded unchanged, in the same positional order, to
    ``ReductionKernel.__init__``.  Once the base initialiser has run,
    ``shared_size`` is set to ``block_size * dtype_out.itemsize``, both of
    which are read from the instance.
    """
    ReductionKernel.__init__(
        self, dtype_out, neutral, reduce_expr, map_expr, arguments,
        name, keep, options, preamble,
    )
    threads_per_block = self.block_size
    self.shared_size = threads_per_block * self.dtype_out.itemsize
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

# Two device vectors holding 0, 1, ..., 399 as float32.
a = gpuarray.arange(400, dtype=numpy.float32)
b = gpuarray.arange(400, dtype=numpy.float32)
# print() with a single argument gives the same output on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(a)

# Dot product: multiply element-wise (map_expr), then sum (reduce_expr).
krnl = ReductionKernel(numpy.float32, neutral="0", reduce_expr="a+b",
                       map_expr="x[i]*y[i]", arguments="float *x, float *y")

# .get() copies the reduction result back to the host.
my_dot_prod = krnl(a, b).get()
print(my_dot_prod)
def test_struct_reduce(self):
    """Reduce random floats into a (min, max) struct and compare with NumPy.

    A structured dtype with ``cur_min``/``cur_max`` fields is registered
    under the C name ``minmax_collector``; the reduction maps every input
    element to a collector holding that element twice and merges collectors
    with ``agg_mmc``.  The result must equal ``np.min``/``np.max`` of the
    same data fetched to the host.
    """
    from pycuda.curandom import rand as curand
    from pycuda.tools import register_dtype
    from pycuda.reduction import ReductionKernel

    # CUDA source for the collector struct and its merge function.  The text
    # is split over adjacent literals purely for line length; the resulting
    # string is unchanged.
    preamble = (
        " struct minmax_collector { float cur_min; float cur_max;"
        " __device__ minmax_collector() { }"
        " __device__ minmax_collector(float cmin, float cmax)"
        " : cur_min(cmin), cur_max(cmax) { }"
        " __device__ minmax_collector(minmax_collector const &src)"
        " : cur_min(src.cur_min), cur_max(src.cur_max) { }"
        " __device__ minmax_collector(minmax_collector const volatile &src)"
        " : cur_min(src.cur_min), cur_max(src.cur_max) { }"
        " __device__ minmax_collector volatile &operator=("
        " minmax_collector const &src) volatile"
        " { cur_min = src.cur_min; cur_max = src.cur_max; return *this; }"
        " };"
        " __device__ minmax_collector agg_mmc(minmax_collector a,"
        " minmax_collector b)"
        " { return minmax_collector("
        " fminf(a.cur_min, b.cur_min),"
        " fmaxf(a.cur_max, b.cur_max)); }"
        " "
    )

    # Host-side mirror of the struct above.
    field_layout = [("cur_min", np.float32), ("cur_max", np.float32)]
    mmc_dtype = np.dtype(field_layout)

    a_gpu = curand((20000,), dtype=np.float32)
    a = a_gpu.get()

    register_dtype(mmc_dtype, "minmax_collector")

    red = ReductionKernel(
        mmc_dtype,
        # FIXME: needs infinity literal in real use, ok here
        neutral="minmax_collector(10000, -10000)",
        reduce_expr="agg_mmc(a, b)",
        map_expr="minmax_collector(x[i], x[i])",
        arguments="float *x",
        preamble=preamble,
    )

    minmax = red(a_gpu).get()

    assert minmax["cur_min"] == np.min(a)
    assert minmax["cur_max"] == np.max(a)
def __init__(self, img_size, **kwargs):
    """Set up one correlation stage: buffers, CUDA module, kernels, textures.

    Args:
        img_size: Pair unpacked as ``(h, w)``.  It is also passed as-is as
            the shape of several device arrays.
        **kwargs: Optional settings, all read through ``kwargs.get``:

            * ``verbose`` (default ``0``)
            * ``iterations`` (default ``5``), stored as ``nbIter``
            * ``show_diff`` (default ``False``); when truthy, ``cv2`` is
              imported and a "Residual" window is created
            * ``mul`` (default ``3``)
            * ``Nfields``; when absent it is taken as ``len(fields[0])``,
              so ``fields`` must then be supplied
            * ``kernel_file``; when absent, ``data/kernels.cu`` inside the
              ``crappy`` package directory is used
            * ``img``, ``fields``, ``mask``; when not ``None`` they are
              forwarded to ``setOrig``, ``setFields`` and ``setMask``
    """
    # Each instance gets a running index taken from the class-level counter.
    self.num = CorrelStage.num
    CorrelStage.num += 1
    self.verbose = kwargs.get("verbose", 0)
    self.debug(2, "Initializing with resolution", img_size)
    self.h, self.w = img_size
    self._ready = False
    self.nbIter = kwargs.get("iterations", 5)
    self.showDiff = kwargs.get("show_diff", False)
    if self.showDiff:
        # cv2 is only needed (and only imported) when the residual is shown
        import cv2
        cv2.namedWindow("Residual", cv2.WINDOW_NORMAL | cv2.WINDOW_KEEPRATIO)
    self.mul = kwargs.get("mul", 3)
    # These two store the values of the last resampled array
    # It is meant to allocate output array only once (see resampleD)
    self.rX, self.rY = -1, -1
    # self.loop will be incremented every time getDisp is called
    # It will be used to measure performance and output some info
    self.loop = 0

    # Allocating stuff #

    # Grid and block for kernels called with the size of the image #
    # All the images and arrays in the kernels will be in order (x,y)
    # NOTE(review): the ceil() calls only round up if "/" is true division;
    # on Python 2 without `from __future__ import division` they would be
    # no-ops and the grid could be 0 for images under 32 px - confirm.
    self.grid = (int(ceil(self.w / 32)), int(ceil(self.h / 32)))
    self.block = (int(ceil(self.w / self.grid[0])),
                  int(ceil(self.h / self.grid[1])), 1)
    self.debug(3, "Default grid:", self.grid, "block", self.block)

    # We need the number of fields to allocate the G tables #
    self.Nfields = kwargs.get("Nfields")
    if self.Nfields is None:
        # Falls back on the "fields" argument; this raises if "fields" was
        # not given either.
        self.Nfields = len(kwargs.get("fields")[0])

    # Allocating everything we need #
    self.devG = []
    self.devFieldsX = []
    self.devFieldsY = []
    for i in range(self.Nfields):
        # devG stores the G arrays (to compute the search direction)
        self.devG.append(gpuarray.empty(img_size, np.float32))
        # devFieldsX/Y store the fields value along X and Y
        self.devFieldsX.append(gpuarray.empty((self.h, self.w), np.float32))
        self.devFieldsY.append(gpuarray.empty((self.h, self.w), np.float32))
    # H stores the Hessian matrix (a host-side NumPy array, zero-filled)
    self.H = np.zeros((self.Nfields, self.Nfields), np.float32)
    # And devHi stores its inverse (on the device)
    self.devHi = gpuarray.empty((self.Nfields, self.Nfields), np.float32)
    # devOut is written with the difference of the images
    self.devOut = gpuarray.empty((self.h, self.w), np.float32)
    # devX stores the value of the parameters (what is actually computed)
    self.devX = gpuarray.empty((self.Nfields), np.float32)
    # to store the search direction
    self.devVec = gpuarray.empty((self.Nfields), np.float32)
    # To store the original image on the device
    self.devOrig = gpuarray.empty(img_size, np.float32)
    # To store the gradient along X of the original image on the device
    self.devGradX = gpuarray.empty(img_size, np.float32)
    # And along Y
    self.devGradY = gpuarray.empty(img_size, np.float32)

    # Locating the kernel file #
    kernelFile = kwargs.get("kernel_file")
    if kernelFile is None:
        self.debug(2, "Kernel file not specified")
        from crappy import __path__ as crappyPath
        kernelFile = crappyPath[0] + "/data/kernels.cu"

    # Reading kernels and compiling module #
    with open(kernelFile, "r") as f:
        self.debug(3, "Sourcing module")
        # The kernel source is a %-format template filled with the image
        # width, height and the number of fields before being compiled.
        self.mod = SourceModule(f.read() % (self.w, self.h, self.Nfields))

    # Assigning functions to the kernels #
    # These kernels are defined in data/kernels.cu
    self._resampleOrigKrnl = self.mod.get_function('resampleO')
    self._resampleKrnl = self.mod.get_function('resample')
    self._gradientKrnl = self.mod.get_function('gradient')
    self._makeGKrnl = self.mod.get_function('makeG')
    self._makeDiff = self.mod.get_function('makeDiff')
    self._dotKrnl = self.mod.get_function('myDot')
    self._addKrnl = self.mod.get_function('kadd')
    # These ones use pyCuda reduction module to generate efficient kernels
    # _mulRedKrnl sums x[i]*y[i] over two float arrays
    self._mulRedKrnl = ReductionKernel(np.float32, neutral="0",
                                       reduce_expr="a+b",
                                       map_expr="x[i]*y[i]",
                                       arguments="float *x, float *y")
    # _leastSquare sums x[i]*x[i] over a single float array
    self._leastSquare = ReductionKernel(np.float32, neutral="0",
                                        reduce_expr="a+b",
                                        map_expr="x[i]*x[i]",
                                        arguments="float *x")
    # We could have used mulRedKrnl(x,x), but this is probably faster ?

    # Getting texture references #
    self.tex = self.mod.get_texref('tex')
    self.tex_d = self.mod.get_texref('tex_d')
    self.texMask = self.mod.get_texref('texMask')

    # Setting proper flags #
    # All textures use normalized coordinates except for the mask
    for t in [self.tex, self.tex_d]:
        t.set_flags(cuda.TRSF_NORMALIZED_COORDINATES)
    # All three textures share the same filter and address modes
    for t in [self.tex, self.tex_d, self.texMask]:
        t.set_filter_mode(cuda.filter_mode.LINEAR)
        t.set_address_mode(0, cuda.address_mode.BORDER)
        t.set_address_mode(1, cuda.address_mode.BORDER)

    # Preparing kernels for less overhead when called #
    self._resampleOrigKrnl.prepare("Pii", texrefs=[self.tex])
    self._resampleKrnl.prepare("Pii", texrefs=[self.tex_d])
    self._gradientKrnl.prepare("PP", texrefs=[self.tex])
    self._makeDiff.prepare("PPPP",
                           texrefs=[self.tex, self.tex_d, self.texMask])
    self._addKrnl.prepare("PfP")

    # Reading original image if provided #
    if kwargs.get("img") is not None:
        self.setOrig(kwargs.get("img"))
    # Reading fields if provided #
    if kwargs.get("fields") is not None:
        self.setFields(kwargs.get("fields"))
    # Reading mask if provided #
    if kwargs.get("mask") is not None:
        self.setMask(kwargs.get("mask"))
from pycuda.reduction import ReductionKernel
import numpy
from pycuda.curandom import rand as curand

# Dot-product reduction: multiply element-wise, then sum.
# The C snippets must use the ASCII '*' character.  The previous text
# contained U+2217 (ASTERISK OPERATOR), which is not valid C and makes the
# generated kernel fail to compile.
dot = ReductionKernel(dtype_out=numpy.float32, neutral="0",
                      reduce_expr="a+b", map_expr="x[i]*y[i]",
                      arguments="const float *x, const float *y")

# One million uniform random samples per vector, generated on the device.
x = curand((1000 * 1000), dtype=numpy.float32)
y = curand((1000 * 1000), dtype=numpy.float32)

# Compute the same quantity on the GPU and on the CPU for comparison.
x_dot_y = dot(x, y).get()
x_dot_y_cpu = numpy.dot(x.get(), y.get())

# print() with a single argument behaves the same on Python 2 and 3.
print(x_dot_y)
print(x_dot_y_cpu)
import pycuda.gpuarray as gpuarray
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

# Vector length
vector_length = 400

# Vectors A and B.
# numpy.int32 is used explicitly: the kernel below declares its arguments as
# C `int *` (32-bit), while the former `numpy.int` alias meant Python's int
# (64-bit on most platforms) and no longer exists in NumPy >= 1.24.
input_vector_a = gpuarray.arange(vector_length, dtype=numpy.int32)
input_vector_b = gpuarray.arange(vector_length, dtype=numpy.int32)

# Parallel reduction: multiply element-wise, then sum.
dot_product_kernel = ReductionKernel(numpy.int32,
                                     arguments="int *x, int *y",
                                     map_expr="x[i]*y[i]",
                                     reduce_expr="a+b",
                                     neutral="0")

# Run the kernel and copy the result back to the host.  The kernel keeps
# its own name so it is not overwritten by its result.
dot_product = dot_product_kernel(input_vector_a, input_vector_b).get()

# Print the results
print("Matriz A")
print(input_vector_a)

print("Matriz B")
print(input_vector_b)

print("Resultado do Produto A * B")
print(dot_product)
import pycuda.gpuarray as gpuarray
import pycuda.driver as drv
import pycuda.autoinit
import numpy
from pycuda.reduction import ReductionKernel

n = 5

# The timed region spans everything between the two record() calls: array
# creation, kernel construction and the reduction itself.
start = drv.Event()
end = drv.Event()
start.record()

d_a = gpuarray.arange(n, dtype=numpy.uint32)
d_b = gpuarray.arange(n, dtype=numpy.uint32)

# Reduction kernel.
# The device arrays and the output are uint32, so the C pointer types are
# declared `unsigned int` to match (they were previously signed `int`).
kernel = ReductionKernel(numpy.uint32, neutral="0", reduce_expr="a+b",
                         map_expr="d_a[i]*d_b[i]",
                         arguments="unsigned int *d_a, unsigned int *d_b")

# Dot product
d_result = kernel(d_a, d_b).get()

end.record()
end.synchronize()
# time_till() is scaled by 1e-3 to report seconds.
secs = start.time_till(end) * 1e-3

print("Vector A")
print(d_a)
print("Vector B")
print(d_b)
print("The computed dot product using reduction:")
print(d_result)
print("Dot Product on GPU")
print("%fs" % (secs))
mod = SourceModule(""" __global__ void CopyRow(float *x, float *y, int n) { for (int j = threadIdx.x + blockIdx.x*blockDim.x; j < n; j += blockDim.x * gridDim.x) { x[j] = y[j]; } } """) CopyRow = mod.get_function("CopyRow") compareGpu = ReductionKernel(np.float32, neutral="0", reduce_expr="max(a, b)", map_expr="abs( abs(x[i]) - abs(y[i]) )", arguments="float *x, float *y") def gpuMatMul(A, B, C, transa='n', transb='n', block=(32, 32, 1)): block = (TileMat, TileMat, 1) transa = transa.lower() transb = transb.lower() bx, by, bz = block Arow, Acol = A.shape Brow, Bcol = B.shape if bx > Tile: bx = Tile if by > Tile:
calc_ll_by_der_per_sample = ElementwiseKernel( "double *err_by_der, double sigma_lm, double *ll_by_der ", "ll_by_der[i] = -0.5 * err_by_der[i]*err_by_der[i] / (sigma_lm*sigma_lm)", "calc_ll_by_der_per_sample") from pycuda import autoinit #calc_diff = ElementwiseKernel( # "double *x, double *out, int N", # """out[i] = (x[i+1]-x[i])/dt - (y[i+1]-y[i])/dt""", # "calc_err_by_der_per_sample") calc_sum_prime = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr=" i < N-1 ? x[i+1]-x[i] : 0", arguments="double *x, int N") calc_sum_double_prime = ReductionKernel( np.float64, neutral="0", reduce_expr="a+b", map_expr=" ((0 < i) && (i < N-1)) ? x[i+1]-2*x[i]+x[i-1] : 0", arguments="double *x, int N") calc_sum_abs_double_prime = ReductionKernel( np.float64, neutral="0", reduce_expr="a+b", map_expr=" ((0 < i) && (i < N-1)) ? abs(x[i+1]-2*x[i]+x[i-1]) : 0",
# Inclusive prefix sum on the GPU, compared with NumPy's cumsum.
seq = np.array([1, 2, 3, 4], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
sum_gpu = InclusiveScanKernel(np.int32, "a+b")  # analogous to lambda a,b: a + b
# print() is used throughout: the bare print statement is a SyntaxError on
# Python 3, and this script already used print() further down.
print(sum_gpu(seq_gpu).get())
print(np.cumsum(seq))

# Running maximum: the last element of the scan is the overall maximum.
seq = np.array([1, 100, -3, -10000, 4, 10000, 66, 14, 21], dtype=np.int32)
seq_gpu = gpuarray.to_gpu(seq)
max_gpu = InclusiveScanKernel(np.int32, "a > b ? a : b")
# Launch the scan once and reuse the host copy for both prints; a running
# maximum of a running maximum is unchanged, so the output is the same.
running_max = max_gpu(seq_gpu).get()
print(running_max)
print(running_max[-1])  # np.max(seq) is 10000 as well

# Dot product computed in parallel on the GPU; for now this only handles
# plain vectors, not 2-D matrices.
dot_prod = ReductionKernel(np.float32, neutral="0", reduce_expr="a+b",
                           map_expr="vec1[i]*vec2[i]",
                           arguments="float *vec1, float *vec2")

x = np.array([1, 2, 3]).astype(np.float32)
y = np.array([6, 7, 8]).astype(np.float32)
device_x = gpuarray.to_gpu(x)
device_y = gpuarray.to_gpu(y)

product = dot_prod(device_x, device_y)
print(product.get())
from pycuda import gpuarray as ga from pycuda import driver from pycuda.elementwise import ElementwiseKernel from pycuda.reduction import ReductionKernel import numpy as np _axby = ElementwiseKernel( """ pycuda::complex<double> a, pycuda::complex<double> *x, pycuda::complex<double> b, pycuda::complex<double> *y""", \ ' x[i] = a * x[i] + b * y[i]') _norm = ReductionKernel(np.complex128, neutral="0", reduce_expr="a+b", map_expr="pow(abs(x[i]), 2)", arguments="pycuda::complex<double> *x") class Grid: def __init__(self, array): # Get the array. if type(array) is np.ndarray: self.g = ga.to_gpu(array) # Copy data to the GPU. elif type(array) is ga.GPUArray: self.g = array # GPUArray already initialized. else: print 'Invalid type' # Raise proper exception here. # # Create the aby function. # if self.g.dtype is np.dtype('complex128'): # cuda_type = 'pycuda::complex<double>'
import pycuda.autoinit
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray
from pycuda.reduction import ReductionKernel
from pycuda.curandom import rand as curand
import numpy

# Monte Carlo estimate of pi: draw random (x, y) pairs on the device and
# count how many satisfy x*x + y*y <= 1.
sample_count = 1000 * 1000

a = curand((sample_count), dtype=numpy.float32)
b = curand((sample_count), dtype=numpy.float32)

# The map expression yields the comparison result for each pair; the
# reduction adds them up, giving the number of pairs inside the circle.
piKernel = ReductionKernel(
    numpy.float32,
    neutral="0",
    reduce_expr="a+b",
    map_expr="float(x[i] * x[i] + y[i] * y[i]) <= 1.0f",
    arguments="float *x, float*y",
)

inside_count = piKernel(a, b).get()
# Four times the fraction of samples that passed the test.
pi = (4.0 * inside_count) / sample_count
print(pi)
''' size = 5 knl = ReductionKernel(dtype_out = np.float32, neutral = "0", reduce_expr = "a+b", map_expr = "x[i]",arguments = "float *x") a = np.random.randint(5, size = size).astype(np.float32) a_gpu = gpuarray.to_gpu(a) result_gpu = knl(a_gpu) print a print "\n" print reduction_cpu(a, size) print "\n" print result_gpu.get() ''' knl = ReductionKernel(dtype_out = np.float32, neutral = "0", reduce_expr = "a+b", map_expr = "x[i]",arguments = "float *x") time_cpu = [] time_knl = [] N = range(1, 3000) for i in N: size = 32 * i a = np.random.randint(5, size = size).astype(np.float32) a_gpu = gpuarray.to_gpu(a) start = time.time() reduction_cpu(a, size) time_cpu.append(time.time() - start) start = time.time()
"sigmoid_double") tanh_float_ker = ElementwiseKernel(f"float *Y, float *x", """ Y[i] = tanh(x[i]) """, "tanh_float") tanh_double_ker = ElementwiseKernel( f"double *Y, double *x", """ double pos_exp = exp (x[i]); double neg_exp = exp (-x[i]); Y[i] = (pos_exp - neg_exp) / (pos_exp + neg_exp) """, "tanh_double") exp_sum_float_ker = ReductionKernel(np.float32, neutral="0.0", reduce_expr="a+b", map_expr="exp (x[i])", arguments=f"float *x") softmax_float_ker = ElementwiseKernel(f"float *Y, float *x, float s", "Y[i] = exp (x[i]) / s", "softmax_float") exp_sum_double_ker = ReductionKernel(np.float32, neutral="0.0", reduce_expr="a+b", map_expr="exp (x[i])", arguments=f"double *x") softmax_double_ker = ElementwiseKernel(f"double *Y, double *x, double s", "Y[i] = exp (x[i]) / s", "softmax_double")
# # The utility functions for GPU computation # import numpy as np from ..util import gpu_init try: from pycuda.reduction import ReductionKernel from pycuda.elementwise import ElementwiseKernel # log|A| for A is a low triangle matrix # logDiagSum(A, A.shape[0]+1) logDiagSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?log(x[i]):0", arguments="double *x, int step") strideSum = ReductionKernel(np.float64, neutral="0", reduce_expr="a+b", map_expr="i%step==0?x[i]:0", arguments="double *x, int step") # np.trace(np.dot(A,B)) (also equivalent to (A*B.T).sum() ) A - a1 x a2, B - a2 x a1 traceDot = ReductionKernel( np.float64, neutral="0", reduce_expr="a+b", map_expr="A[i]*B[(i%a1)*a2+i/a1]",