def svm(scores, labels, mode, error=None, allocator=memPool):
    """Launch the SVM ("l1"/"l2") loss kernel; return (error, grad).

    scores -- float32 GPU tensor of per-class scores; NOTE(review): layout
        presumed (batch, classes, *spatial) from the stride math -- confirm
        against the kernel source.
    labels -- int32 GPU tensor of target class indices.
    mode -- "l1" or "l2", selects the kernel module.
    error -- optional scalar float32 accumulator; allocated here when None.
    allocator -- GPU memory pool used for fresh allocations.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    # The kernel accumulates into error, so it must start at zero either way.
    error.fill(0.0)

    elems = prod(shape)
    spatialDim = prod(shape[2:])
    mapStride = spatialDim * shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    kernels = {"l1": svmL1Mod, "l2": svmL2Mod}[mode]
    kernels.cost(
        scores, labels, np.int32(elems), np.int32(mapStride), np.int32(spatialDim),
        np.int32(shape[1]), np.int32(shape[0]), error, grad, block=block, grid=grid
    )

    return error, grad
def matvec(mat, vec, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Batched matrix-vector product via the custom mul kernels.

    Computes out = alpha * (mat x vec) + beta * out over the trailing 2d
    slices of mat; axis=1 reduces along rows, axis=0 along columns.
    float32 and float16 tensors are supported; vec must have exactly one
    dimension fewer than mat. Returns out (freshly zero-allocated when None).
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32 or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    h, w = mat.shape[-2:]
    fp32 = mat.dtype == np.float32

    if axis == 1:
        assert mat.dimAt(-1) == vec.dimAt(-1)

        block = (warpSize, 1, 1)
        grid = (h, 1, prod(mat.shape[:-2]))

        if out is None:
            out = GPUArray.zeros(mat.shape[:-1], dtype=mat.dtype, allocator=allocator)
        else:
            assert out.shape == mat.shape[:-1]

        fn = mulmod.vecMulOnRow if fp32 else mulmod.vecMulOnRowFP16

    else:
        block = (NT, 1, 1)
        grid = (roundUpDiv(w, block[0]), 1, prod(mat.shape[:-2]))

        if out is None:
            out = GPUArray.zeros(mat.shape[:-2] + (w, ), dtype=mat.dtype, allocator=allocator)
        else:
            assert out.shape == mat.shape[:-2] + (w, )

        fn = mulmod.vecMulOnCol if fp32 else mulmod.vecMulOnColFP16

    # Both row and column kernels share the same argument list.
    fn(
        out, mat, vec, np.int32(w), np.int32(h), np.float32(alpha), np.float32(beta),
        block=block, grid=grid
    )

    return out
def crossEntropy(scores, labels, weights=None, error=None, allocator=memPool):
    """Softmax cross-entropy loss and gradient; returns (error, grad).

    scores -- float32 GPU tensor; padded with trailing singleton dims to 4d
        before the cudnn spatial softmax. grad keeps the original shape.
    labels -- int32 GPU tensor of target class indices.
    weights -- optional weights tensor; selects the weighted kernel.
    error -- optional scalar float32 accumulator; allocated here when None.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    if scores.ndim < 4:
        # cudnn's spatial softmax wants a 4d tensor; pad with unit dims.
        scores = scores.reshape(*shape, *((1, ) * (4 - scores.ndim)))

    softmax = cudnn.softmaxNd(scores, mode=SoftMaxMode.spatial.value, allocator=allocator)

    grad = GPUArray.empty(shape, dtype=np.float32, allocator=allocator)

    if error is None:
        error = GPUArray.empty((), dtype=np.float32, allocator=allocator)

    # The kernel accumulates into error, so it must start at zero either way.
    error.fill(0.0)

    elems = prod(scores.shape)
    spatialDim = prod(scores.shape[2:])
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    if weights is None:
        ceMod.cost(
            softmax, labels, np.int32(elems), np.int32(mapStride), np.int32(spatialDim),
            np.int32(scores.shape[1]), np.int32(scores.shape[0]), error, grad,
            block=block, grid=grid
        )
    else:
        wceMod.cost(
            softmax, labels, weights, np.int32(elems), np.int32(mapStride),
            np.int32(spatialDim), np.int32(shape[1]), np.int32(shape[0]), error, grad,
            block=block, grid=grid
        )

    return error, grad
def matsum(tensor, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Sum a tensor along one axis: out = alpha * sum + beta * out.

    Uses the warp-wide row kernel when axis is the innermost dimension and
    the generic column kernel otherwise. float32 and float16 are supported.
    Returns out (freshly zero-allocated when None).
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    fp32 = tensor.dtype == np.float32

    if axis == tensor.ndim - 1:
        block = (warpSize, 1, 1)
        grid = (prod(tensor.shape[:-1]), 1, 1)

        if out is None:
            out = GPUArray.zeros(tensor.shape[:-1], dtype=tensor.dtype, allocator=allocator)
        else:
            assert out.shape == tensor.shape[:-1]

        fn = summod.sumOnRow if fp32 else summod.sumOnRowFP16
        fn(
            out, tensor, np.int32(tensor.dimAt(-1)), np.float32(alpha), np.float32(beta),
            block=block, grid=grid
        )

    else:
        outshape = tensor.shape[:axis] + tensor.shape[axis + 1:]
        z, width = prod(tensor.shape[:axis]), prod(tensor.shape[axis + 1:])

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, block[0]), 1, z)

        if out is None:
            out = GPUArray.zeros(outshape, dtype=tensor.dtype, allocator=allocator)
        else:
            assert out.shape == outshape

        fn = summod.sumOnCol if fp32 else summod.sumOnColFP16
        fn(
            out, tensor, np.int32(width), np.int32(tensor.dimAt(axis)),
            np.float32(alpha), np.float32(beta), block=block, grid=grid
        )

    return out
def maxunpool2dBackward(grad, poolshape, mask, allocator=memPool):
    """Backward pass of 2d max-unpooling.

    grad -- float32 gradient w.r.t. the unpooled output, (batch, maps, outh, outw).
    poolshape -- shape of the pooled tensor whose gradient is produced
        (only the spatial dims [2] and [3] are read).
    mask -- int32 index map recorded by the forward pooling.
    Returns the gradient w.r.t. the pooled input.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    inh, inw = poolshape[2], poolshape[3]

    ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=np.float32, allocator=allocator)
    elems = prod(ingrad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    mod.maxunpool2dBackward(
        ingrad, grad, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(elems), block=block, grid=grid
    )

    return ingrad
def maxunpool2d(data, origshape, mask, allocator=memPool):
    """Forward 2d max-unpooling: scatter data into a zeroed output via mask.

    data -- float32 pooled tensor, (batch, maps, inh, inw).
    origshape -- shape of the original (pre-pool) tensor; only the spatial
        dims [2] and [3] are read.
    mask -- int32 index map recorded by the forward pooling.
    Returns the unpooled tensor (positions not covered by mask stay zero,
    hence the zeros allocation).
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    outh, outw = origshape[2], origshape[3]

    outdata = GPUArray.zeros((batchsize, maps, outh, outw), dtype=np.float32, allocator=allocator)
    elems = prod(data.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    mod.maxunpool2d(
        outdata, data, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(elems), block=block, grid=grid
    )

    return outdata
def preluBackwardParams(indata, outgrad, sharedMaps=False, allocator=memPool):
    """Gradient of PReLU with respect to its slope parameters.

    A kernel accumulates per-element slope gradients over the batch
    dimension, then matsum reduces them: over everything when sharedMaps is
    True, otherwise per leading (map) dimension.
    """
    assert indata.dtype == outgrad.dtype and outgrad.dtype == np.float32
    assert indata.shape == outgrad.shape

    # The per-sample stride equals the per-sample element count here.
    size = prod(outgrad.shape[1:])
    stride = prod(outgrad.shape[1:])

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(size, nthreads), 1, 1)

    slopegrad = GPUArray.empty(outgrad.shape[1:], dtype=np.float32, allocator=allocator)

    mod.preluBackwardParams(
        slopegrad, outgrad, indata, np.int32(outgrad.shape[0]), np.int32(stride),
        np.int32(size), block=block, grid=grid
    )

    if sharedMaps:
        shape = (1, prod(slopegrad.shape))
    else:
        shape = (slopegrad.shape[0], prod(slopegrad.shape[1:]))

    return matsum(slopegrad.reshape(shape), axis=1)
def prelu(data, slopes, inplace=False, sharedMaps=False, allocator=memPool):
    """PReLU activation with per-map or shared slopes.

    data -- float32 GPU tensor; first two dims presumed (batch, maps) --
        TODO(review): confirm against the kernel source.
    slopes -- float32 slopes; shape (1,) when sharedMaps, else one per map.
    inplace -- write the result back into data instead of a new tensor.
    Returns the activated tensor.
    """
    assert data.dtype == slopes.dtype and slopes.dtype == np.float32
    if sharedMaps:
        assert slopes.shape == (1, )
    else:
        assert data.shape[1] == slopes.shape[0]

    outdata = data if inplace else GPUArray.empty(data.shape, dtype=np.float32, allocator=allocator)

    mapsize = prod(data.shape[2:])
    elems = prod(data.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    # Presumably collapses the map index onto the single shared slope when
    # sharedMaps is set -- confirm against the kernel.
    divFactor = data.shape[1] if sharedMaps else 1

    mod.prelu(
        outdata, data, slopes, np.int32(divFactor), np.int32(mapsize),
        np.int32(data.shape[1]), np.int32(elems), block=block, grid=grid
    )

    return outdata
def preluBackwardData(grad, slopes, indata, sharedMaps=False, allocator=memPool):
    """Gradient of PReLU with respect to its input data.

    grad -- float32 gradient w.r.t. the activation output.
    slopes -- float32 slopes; shape (1,) when sharedMaps, else one per map.
    indata -- the forward-pass input (needed to pick the active branch).
    Returns the gradient w.r.t. indata.
    """
    assert grad.dtype == slopes.dtype and slopes.dtype == indata.dtype and indata.dtype == np.float32
    assert grad.shape == indata.shape
    if sharedMaps:
        assert slopes.shape == (1, )
    else:
        assert grad.shape[1] == slopes.shape[0]

    ingrad = GPUArray.empty(grad.shape, dtype=np.float32, allocator=allocator)

    mapsize = prod(grad.shape[2:])
    elems = prod(grad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    # Presumably collapses the map index onto the single shared slope when
    # sharedMaps is set -- confirm against the kernel.
    divFactor = grad.shape[1] if sharedMaps else 1

    mod.preluBackwardData(
        ingrad, grad, slopes, indata, np.int32(divFactor), np.int32(mapsize),
        np.int32(grad.shape[1]), np.int32(elems), block=block, grid=grid
    )

    return ingrad
def argminmax(tensor, axis, mode, allocator=memPool):
    """Index of the min or max element along one axis.

    :param tensor: float32 or float16 GPU tensor.
    :param axis: axis to reduce over.
    :param mode: "max" or "min" -- selects the kernel module.
    :param allocator: GPU memory allocator. Fix: now defaults to memPool for
        consistency with every other routine in this module (it used to be a
        required argument); existing calls are unaffected.
    :returns: int32 GPU tensor of indices with the reduced axis removed.
    """
    assert tensor.dtype == np.float32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    mod = {"max": maxmod, "min": minmod}[mode]

    if axis == tensor.ndim - 1:
        # Innermost axis: one warp per row.
        block = (warpSize, 1, 1)
        grid = (prod(tensor.shape[:-1]), 1, 1)

        idx = GPUArray.empty(tensor.shape[:-1], dtype=np.int32, allocator=allocator)

        fn = mod.minMaxOnRow if tensor.dtype == np.float32 else mod.minMaxOnRowFP16
        fn(idx, tensor, np.int32(tensor.dimAt(-1)), block=block, grid=grid)

    else:
        # Generic axis: column kernel over (z, axis, width) layout.
        z, width = prod(tensor.shape[:axis]), prod(tensor.shape[axis + 1:])

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, block[0]), 1, z)

        idx = GPUArray.empty(
            tensor.shape[:axis] + tensor.shape[axis + 1:], dtype=np.int32, allocator=allocator
        )

        fn = mod.minMaxOnCol if tensor.dtype == np.float32 else mod.minMaxOnColFP16
        fn(idx, tensor, np.int32(width), np.int32(tensor.dimAt(axis)), block=block, grid=grid)

    return idx
def getRnnParam(rnn, W, layer, linLayer, Wshape):
    """Slice one linear-layer weight matrix and its bias out of the packed
    RNN parameter buffer W (the arrays presumably alias W's device memory,
    since they wrap slices of its gpudata -- no copy is made).

    :param rnn: RNN descriptor; getParam reports (offset, size) pairs.
    :param W: flat GPU parameter buffer.
    :param layer: layer index.
    :param linLayer: linear-layer index within the layer.
    :param Wshape: expected shape of the weight matrix; asserted against the
        reported element count.
    :returns: (w, bias) GPUArray pair.
    """
    Wtuple, biasTuple = rnn.getParam(W, layer, linLayer)

    Woffset, wsize = Wtuple
    biasOffset, biasSize = biasTuple

    # Fix: the locals dtype/gpudata were bound but the code kept re-reading
    # W.dtype / W.gpudata (gpudata was entirely unused); use them consistently.
    dtype, gpudata = W.dtype, W.gpudata
    Wbytes, biasBytes = wsize * dtype.itemsize, biasSize * dtype.itemsize

    assert prod(Wshape) == wsize

    w = GPUArray(Wshape, dtype=dtype, gpudata=gpudata[Woffset:Woffset + Wbytes])
    bias = GPUArray((biasSize, ), dtype=dtype, gpudata=gpudata[biasOffset:biasOffset + biasBytes])

    return w, bias
def maxpool2dBackward(grad, origshape, mask, size, stride, pad, allocator=memPool):
    """Backward pass of 2d max-pooling.

    :param grad: float32 gradient w.r.t. the pooled output, (batch, maps, outh, outw).
    :param origshape: shape of the original input; spatial dims [2]/[3] are read.
    :param mask: int32 indices recorded by the forward pass.
    :param size: pooling window (fh, fw).
    :param stride: (hstride, wstride).
    :param pad: (hpad, wpad).
    :returns: gradient w.r.t. the pre-pool input.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    inh, inw = origshape[2], origshape[3]
    ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=np.float32, allocator=allocator)

    # Fix: the element count was rebound onto the `size` parameter (the
    # pooling window), shadowing it; use a distinct local instead.
    elems = prod(ingrad.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    mod.maxpool2dBackward(
        ingrad, grad, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(hstride), np.int32(wstride), np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw), np.int32(elems), block=block, grid=grid
    )

    return ingrad
def maxpool2d(data, size, stride, pad, allocator=memPool):
    """Forward 2d max-pooling; returns (outdata, mask).

    :param data: float32 input tensor, (batch, maps, inh, inw).
    :param size: pooling window (fh, fw).
    :param stride: (hstride, wstride).
    :param pad: (hpad, wpad).
    :returns: pooled float32 tensor and the int32 argmax mask needed by
        maxpool2dBackward / maxunpool2d.
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    fh, fw = size
    hstride, wstride = stride
    hpad, wpad = pad

    # Standard pooling output-size formula.
    outh = (inh - fh + 2 * hpad) // hstride + 1
    outw = (inw - fw + 2 * wpad) // wstride + 1

    outdata = GPUArray.empty((batchsize, maps, outh, outw), dtype=np.float32, allocator=allocator)
    mask = GPUArray.empty((batchsize, maps, outh, outw), dtype=np.int32, allocator=allocator)

    # Fix: the element count was rebound onto the `size` parameter (the
    # pooling window), shadowing it; use a distinct local instead.
    elems = prod(outdata.shape)

    block = (nthreads, 1, 1)
    grid = (roundUpDiv(elems, nthreads), 1, 1)

    mod.maxpool2d(
        outdata, data, mask, np.int32(inh), np.int32(inw), np.int32(outh), np.int32(outw),
        np.int32(maps), np.int32(hstride), np.int32(wstride), np.int32(hpad), np.int32(wpad),
        np.int32(fh), np.int32(fw), np.int32(elems), block=block, grid=grid
    )

    return outdata, mask
def addVecToMat(vec, mat, axis=0, out=None, allocator=memPool):
    """Broadcast a vector onto the trailing 2d slices of mat via the op kernels.

    axis=1 applies vec along rows; when the row length is a larger multiple
    of the vector length, the tiled "one vec" kernel is used. axis=0 applies
    vec along columns. float32 and float16 are supported.

    :returns: out (freshly allocated when None).
    """
    assert vec.dtype == mat.dtype and (mat.dtype == np.float32 or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2
    assert mat.shape[:-2] == vec.shape[:-1]

    if out is None:
        out = GPUArray.empty(mat.shape, dtype=mat.dtype, allocator=allocator)
    else:
        # Fix: validate a caller-provided out, consistent with matvec/matsum
        # (previously a wrong-shaped out was passed straight to the kernel).
        assert out.shape == mat.shape

    z = prod(mat.shape[:-2])
    n, m = mat.shape[-2:]

    block = (warpSize, warpSize, 1)
    grid = (roundUpDiv(m, block[0]), roundUpDiv(n, block[1]), z)

    fp32 = mat.dtype == np.float32

    if axis == 1:
        if mat.dimAt(-1) == vec.dimAt(-1):
            fn = addmod.opRowVecToMat if fp32 else addmod.opRowVecToMatFP16
            fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)
        else:
            assert mat.dimAt(-1) % vec.dimAt(-1) == 0
            fn = addmod.opRowOneVecToMat if fp32 else addmod.opRowOneVecToMatFP16
            fn(out, vec, mat, np.int32(n), np.int32(m), np.int32(vec.dimAt(-1)), block=block, grid=grid)
    else:
        fn = addmod.opColVecToMat if fp32 else addmod.opColVecToMatFP16
        fn(out, vec, mat, np.int32(n), np.int32(m), block=block, grid=grid)

    return out