示例#1
0
def matvec(mat, vec, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a matrix/vector kernel over the last two dimensions of ``mat``.

    Args:
        mat: float32 or float16 array; its last two dimensions are read as
            ``(h, w)`` and any leading dimensions are folded into the grid's
            z extent.
        vec: array with the same dtype as ``mat`` and one dimension fewer.
        axis: 1 selects the ``vecMulOnRow`` kernels (result shape
            ``mat.shape[:-1]``); 0 selects the ``vecMulOnCol`` kernels
            (result shape ``mat.shape[:-2] + (w,)``).
        out: optional preallocated result; its shape must match the result
            shape for the chosen axis. A zero-filled array is allocated
            when omitted.
        alpha, beta: scalars handed to the kernel as float32. Presumably
            they scale the product and the previous contents of ``out``;
            confirm against the kernel source.
        allocator: allocator passed to ``GPUArray.zeros``.

    Returns:
        The array that was handed to the kernel as its output.
    """
    isFloat32 = mat.dtype == np.float32

    assert vec.dtype == mat.dtype and (isFloat32 or mat.dtype == np.float16)
    assert vec.ndim == mat.ndim - 1 and 0 <= axis < 2

    h, w = mat.shape[-2:]
    onRows = axis == 1

    # Pick the launch geometry and the expected result shape for this axis.
    if onRows:
        assert mat.dimAt(-1) == vec.dimAt(-1)

        outshape = mat.shape[:-1]
        block = (warpSize, 1, 1)
        grid = (h, 1, prod(mat.shape[:-2]))
    else:
        outshape = mat.shape[:-2] + (w, )
        block = (NT, 1, 1)
        grid = (roundUpDiv(w, NT), 1, prod(mat.shape[:-2]))

    # Allocate the result or validate the caller-supplied one.
    if out is None:
        out = GPUArray.zeros(outshape, dtype=mat.dtype, allocator=allocator)
    else:
        assert out.shape == outshape

    if onRows:
        kernel = mulmod.vecMulOnRow if isFloat32 else mulmod.vecMulOnRowFP16
    else:
        kernel = mulmod.vecMulOnCol if isFloat32 else mulmod.vecMulOnColFP16

    kernel(out, mat, vec, np.int32(w), np.int32(h), np.float32(alpha),
           np.float32(beta), block=block, grid=grid)

    return out
示例#2
0
文件: Pad.py 项目: rsarbaev/PuzzleLib
def reflectpadBackward(grad, pad, allocator=memPool):
    """Run the reflect-padding backward kernel for a 3D or 4D gradient.

    Args:
        grad: gradient of the padded output. A 3D array is unpacked as
            ``(batchsize, maps, outsize)``, a 4D array as
            ``(batchsize, maps, outh, outw)``.
        pad: ``(lpad, rpad)`` for 3D input, ``(upad, bpad, lpad, rpad)``
            for 4D input.
        allocator: allocator passed to ``GPUArray.zeros``.

    Returns:
        A zero-initialised array of grad's dtype, with the pad amounts
        subtracted from the spatial sizes, after it has been handed to the
        backward kernel.

    Raises:
        NotImplementedError: if ``grad.ndim`` is neither 3 nor 4.
    """
    ndim = grad.ndim
    if ndim not in (3, 4):
        raise NotImplementedError(ndim)

    isFloat32 = grad.dtype == np.float32
    block = (warpSize, 1, 1)

    if ndim == 3:
        batchsize, maps, outsize = grad.shape
        lpad, rpad = pad

        insize = outsize - lpad - rpad
        # The x extent of the grid covers the padded length, not the input.
        grid = (roundUpDiv(outsize, warpSize), maps, batchsize)

        ingrad = GPUArray.zeros((batchsize, maps, insize),
                                dtype=grad.dtype,
                                allocator=allocator)

        kernel = mod.reflectpad1dBackward if isFloat32 else mod.reflectpad1dBackwardFP16
        kernel(ingrad, grad, np.int32(insize), np.int32(lpad), np.int32(rpad),
               block=block, grid=grid)

        return ingrad

    batchsize, maps, outh, outw = grad.shape
    upad, bpad, lpad, rpad = pad

    inh = outh - upad - bpad
    inw = outw - lpad - rpad

    # One x slot per element of the padded plane.
    grid = (roundUpDiv(outh * outw, warpSize), maps, batchsize)

    ingrad = GPUArray.zeros((batchsize, maps, inh, inw),
                            dtype=grad.dtype,
                            allocator=allocator)

    kernel = mod.reflectpad2dBackward if isFloat32 else mod.reflectpad2dBackwardFP16
    kernel(ingrad, grad, np.int32(inh), np.int32(inw), np.int32(upad),
           np.int32(bpad), np.int32(lpad), np.int32(rpad),
           block=block, grid=grid)

    return ingrad
示例#3
0
def matsum(tensor, axis=0, out=None, alpha=1.0, beta=0.0, allocator=memPool):
    """Launch a sum kernel that removes one axis of ``tensor``.

    Args:
        tensor: float32 or float16 array.
        axis: axis to remove, ``0 <= axis < tensor.ndim``. The last axis
            uses the ``sumOnRow`` kernels, any other axis the ``sumOnCol``
            kernels.
        out: optional preallocated result whose shape must equal
            ``tensor.shape`` without ``axis``. A zero-filled array is
            allocated when omitted.
        alpha, beta: scalars handed to the kernel as float32. Presumably
            they scale the sum and the previous contents of ``out``;
            confirm against the kernel source.
        allocator: allocator passed to ``GPUArray.zeros``.

    Returns:
        The array that was handed to the kernel as its output.
    """
    isFloat32 = tensor.dtype == np.float32

    assert isFloat32 or tensor.dtype == np.float16
    assert 0 <= axis < tensor.ndim

    shape = tensor.shape
    overLastAxis = axis == tensor.ndim - 1

    if overLastAxis:
        outshape = shape[:-1]

        block = (warpSize, 1, 1)
        # One block per remaining element.
        grid = (prod(outshape), 1, 1)
    else:
        leading, trailing = shape[:axis], shape[axis + 1:]
        outshape = leading + trailing

        z, width = prod(leading), prod(trailing)

        block = (NT, 1, 1)
        grid = (roundUpDiv(width, NT), 1, z)

    if out is None:
        out = GPUArray.zeros(outshape, dtype=tensor.dtype, allocator=allocator)
    else:
        assert out.shape == outshape

    scaling = (np.float32(alpha), np.float32(beta))

    if overLastAxis:
        kernel = summod.sumOnRow if isFloat32 else summod.sumOnRowFP16
        kernel(out, tensor, np.int32(tensor.dimAt(-1)), *scaling,
               block=block, grid=grid)
    else:
        kernel = summod.sumOnCol if isFloat32 else summod.sumOnColFP16
        kernel(out, tensor, np.int32(width), np.int32(tensor.dimAt(axis)),
               *scaling, block=block, grid=grid)

    return out
示例#4
0
def maxunpool2d(data, origshape, mask, allocator=memPool):
    """Launch the 2D max-unpooling kernel.

    Args:
        data: float32 array whose shape unpacks as
            ``(batchsize, maps, inh, inw)``.
        origshape: shape of the array to restore; only indices 2 and 3
            (height and width) are read.
        mask: array passed straight to the kernel. Presumably it holds the
            argmax positions recorded by the pooling pass; confirm against
            the caller.
        allocator: allocator passed to ``GPUArray.zeros``.

    Returns:
        A float32 array of shape ``(batchsize, maps, outh, outw)``,
        zero-initialised before the kernel runs.
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    outh, outw = origshape[2], origshape[3]

    outdata = GPUArray.zeros((batchsize, maps, outh, outw),
                             dtype=np.float32,
                             allocator=allocator)

    # The launch is sized by the element count of ``data``.
    size = prod(data.shape)
    launch = {
        "block": (nthreads, 1, 1),
        "grid": (roundUpDiv(size, nthreads), 1, 1),
    }

    dims = tuple(np.int32(v) for v in (inh, inw, outh, outw, maps, size))
    mod.maxunpool2d(outdata, data, mask, *dims, **launch)

    return outdata
示例#5
0
def upsample2dBackward(grad, scale, mode="nearest", allocator=memPool):
	"""Compute the input gradient of a 2D upsampling from its output gradient.

	Args:
		grad: gradient with respect to the upsampled output; its shape is
			unpacked as (batchsize, maps, outh, outw).
		scale: an int applied to both spatial axes, or a (hscale, wscale) pair.
		mode: "nearest" or "linear".
		allocator: allocator passed to the GPUArray constructor of the result.

	Returns:
		GPUArray of shape (batchsize, maps, outh // hscale, outw // wscale)
		with the dtype of grad.

	Raises:
		NotImplementedError: if mode is neither "nearest" nor "linear".
	"""
	batchsize, maps, outh, outw = grad.shape
	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale

	inh, inw = outh // hscale, outw // wscale

	if mode == "nearest":
		# The buffer is left uninitialised; the launch is sized by ingrad.size,
		# so presumably the kernel writes every element - confirm against the kernel.
		ingrad = GPUArray.empty((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		blk = warpSize * 8
		block = (blk, 1, 1)
		grid = (roundUpDiv(ingrad.size, blk), 1, 1)

		nearestMod.upsample2dNearestBackward(
			ingrad, grad, np.int32(inw), np.int32(outw), np.int32(hscale), np.int32(wscale), np.int32(ingrad.size),
			block=block, grid=grid
		)

	elif mode == "linear":
		# Zero-filled, unlike the nearest branch; presumably the kernel accumulates into it.
		ingrad = GPUArray.zeros((batchsize, maps, inh, inw), dtype=grad.dtype, allocator=allocator)

		block = (warpSize, nthreads // warpSize, 1)
		grid = (roundUpDiv(outw, block[0]), roundUpDiv(outh, block[1]), 1)

		# Ratio between input and output coordinates with the corner pixels aligned.
		# An axis with a single output pixel has no span to divide by, which used to
		# raise ZeroDivisionError; fall back to a ratio of 0.0 for that axis.
		rh = (inh - 1) / (outh - 1) if outh > 1 else 0.0
		rw = (inw - 1) / (outw - 1) if outw > 1 else 0.0

		linearMod.upsample2dLinearBackward(
			ingrad, grad, np.int32(batchsize), np.int32(maps), np.int32(inh), np.int32(inw),
			np.int32(outh), np.int32(outw), np.float32(rh), np.float32(rw), block=block, grid=grid
		)

	else:
		raise NotImplementedError(mode)

	return ingrad
示例#6
0
def batchNorm3dTest(dtype, atol):
	"""Check context.batchNormNd and its backward pass against a NumPy reference on 5D data.

	Three stages are verified with np.allclose at tolerance atol: the training
	forward pass (mean, saved inverse variance, output), the backward pass
	(input, scale and bias gradients), and the forward pass with test=True
	using freshly drawn mean and variance values.

	Args:
		dtype: NumPy dtype of the data and gradient arrays; scale, bias and the
			statistics are always float32.
		atol: absolute tolerance for every comparison.
	"""
	batchsize, maps, d, h, w = 2, 5, 2, 3, 2
	epsilon = 1e-5
	norm = batchsize * d * h * w

	# Every axis except the channel axis is reduced; parameters broadcast along those axes.
	axes = (0, 2, 3, 4)
	paramShape = (1, maps, 1, 1, 1)

	def reduce(values):
		return np.sum(values, axis=axes, dtype=np.float32, keepdims=True)

	hostData = np.random.randn(batchsize, maps, d, h, w).astype(dtype)

	hostScale = np.random.randn(*paramShape).astype(np.float32)
	hostBias = np.random.randn(*paramShape).astype(np.float32)

	data = GPUArray.toGpu(hostData)
	scale = GPUArray.toGpu(hostScale.ravel())
	bias = GPUArray.toGpu(hostBias.ravel())

	mean = GPUArray.zeros(scale.shape, dtype=np.float32)
	var = GPUArray.toGpu(np.ones(scale.shape, dtype=np.float32))

	# Training-mode forward pass; the result is written into the input buffer (out=data).
	outdata, savemean, saveinvvar = context.batchNormNd(data, mean, var, scale, bias, epsilon=epsilon, out=data)

	hostMean = reduce(hostData) / norm
	centered = hostData - hostMean

	hostInvVar = reduce(centered ** 2) / norm
	hostInvVar = 1.0 / np.sqrt(hostInvVar + epsilon)

	hostNormData = centered * hostInvVar
	hostOutData = (hostNormData * hostScale + hostBias).astype(dtype)

	assert np.allclose(hostMean.ravel(), mean.get(), atol=atol)
	assert np.allclose(hostInvVar.ravel(), saveinvvar.get(), atol=atol)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)

	# Backward pass; the input is uploaded again because the forward pass overwrote it.
	hostGrad = np.random.randn(*outdata.shape).astype(dtype)

	grad = GPUArray.toGpu(hostGrad)
	data = GPUArray.toGpu(hostData)

	ingrad, scalegrad, biasgrad = context.batchNormNdBackward(grad, data, scale, savemean, saveinvvar, epsilon=epsilon)

	hostScaleGrad = reduce(hostGrad * hostNormData)
	hostBiasGrad = reduce(hostGrad)

	hostMeanGrad = -hostInvVar * hostBiasGrad * hostScale

	hostVarGrad = reduce(hostGrad * centered)
	hostVarGrad = -0.5 * hostVarGrad * hostScale * hostInvVar**3

	hostInGrad = hostGrad * hostScale * hostInvVar + (2 * hostVarGrad * centered + hostMeanGrad) / norm
	hostInGrad = hostInGrad.astype(dtype)

	assert np.allclose(hostInGrad, ingrad.get(), atol=atol)
	assert np.allclose(hostScaleGrad.ravel(), scalegrad.get(), atol=atol)
	assert np.allclose(hostBiasGrad.ravel(), biasgrad.get(), atol=atol)

	# Forward pass with test=True, using freshly drawn statistics (variance kept >= 1).
	hostRunMean = np.random.randn(*paramShape).astype(np.float32)
	hostRunVar = 1.0 + np.random.randn(*paramShape).astype(np.float32)**2

	mean = GPUArray.toGpu(hostRunMean.ravel())
	var = GPUArray.toGpu(hostRunVar.ravel())

	outdata = context.batchNormNd(data, mean, var, scale, bias, test=True)

	hostOutData = ((hostData - hostRunMean) / np.sqrt(hostRunVar + epsilon) * hostScale + hostBias).astype(dtype)
	assert np.allclose(hostOutData, outdata.get(), atol=atol)
示例#7
0
文件: CTC.py 项目: rsarbaev/PuzzleLib
def ctcLoss(data, datalen, labels, lengths, blank, error=None, normalized=False, returnAlphas=False):
	"""Run the CTC alpha and beta kernels and return the error and the gradient buffer.

	Args:
		data: array whose shape unpacks as (T, batchsize, vocabsize). Unless
			normalized is true it is first passed through cudnn.softmaxNd.
		datalen: passed to both kernels unchanged; presumably the per-sample
			sequence lengths - confirm against the kernel source.
		labels: passed to both kernels unchanged.
		lengths: host-side label lengths; used to pick the kernel configuration
			and to build the offsets array given to the kernels.
		blank: index passed to the kernels as int32.
		error: optional preallocated scalar for the error; a zero-filled
			float32 scalar is allocated when None.
		normalized: skip the softmax when true.
		returnAlphas: also return the alphas buffer.

	Returns:
		(error, grad), or (error, grad, alphas) when returnAlphas is true.

	Raises:
		ValueError: raised by min() when no entry of configs satisfies
			2 * max(lengths) + 1 <= NT * VT.
	"""
	T, batchsize, vocabsize = data.shape

	# Lowest-index configuration whose NT * VT product covers 2 * max(lengths) + 1.
	required = 2 * np.max(lengths) + 1
	fitting = [idx for idx, (nt, vt) in enumerate(configs) if required <= nt * vt]

	config = min(fitting)
	kernels, blockThreads = modules[config], configs[config][0]

	if not normalized:
		flat = data.reshape(T * batchsize, vocabsize, 1, 1)
		data = cudnn.softmaxNd(flat, allocator=memPool).reshape(T, batchsize, vocabsize)

	# Exclusive prefix sums of the label lengths: a leading zero followed by the running totals.
	cumulative = np.cumsum(lengths, dtype=np.int32)

	extOffsets = np.zeros(shape=(batchsize + 1, ), dtype=np.int32)
	extOffsets[1:] = cumulative

	alphasSize = T * (2 * int(cumulative[-1]) + batchsize)

	alphas = GPUArray.empty((alphasSize, ), dtype=np.float32, allocator=memPool)
	offsets = GPUArray.toGpu(extOffsets, allocator=memPool)

	nll = GPUArray.empty((batchsize, ), dtype=np.float32, allocator=memPool)

	if error is None:
		error = GPUArray.zeros((), dtype=np.float32, allocator=memPool)

	grad = GPUArray.zeros(data.shape, dtype=np.float32, allocator=memPool)

	# Both kernels take the same leading arguments and launch one block per batch item.
	shared = (data, datalen, np.int32(T), np.int32(vocabsize), labels, offsets, alphas, np.int32(blank), nll)
	launch = {"block": (blockThreads, 1, 1), "grid": (batchsize, 1, 1)}

	kernels.calcAlphas(*shared, error, **launch)
	kernels.calcBetas(*shared, grad, **launch)

	if returnAlphas:
		return error, grad, alphas

	return error, grad
示例#8
0
def embed(data, W, allocator=memPool):
    """Launch the embedding kernel for a batch of index sequences.

    Args:
        data: int32 array whose shape unpacks as ``(batchsize, sentlen)``.
        W: float32 2D array; its second dimension is taken as the
            embedding size.
        allocator: allocator passed to ``GPUArray.zeros``.

    Returns:
        A float32 array of shape ``(batchsize, sentlen, embsize)``,
        zero-initialised before the kernel runs.
    """
    assert data.dtype == np.int32 and W.dtype == np.float32

    batchsize, sentlen = data.shape
    _, embsize = W.shape

    outdata = GPUArray.zeros((batchsize, sentlen, embsize),
                             dtype=np.float32,
                             allocator=allocator)

    # The kernel sees the indices as one flat list of batchsize * sentlen entries.
    nindices = batchsize * sentlen

    # Square tiles: x spans the embedding dimension, y spans the indices.
    tile = warpSize
    launch = {
        "block": (tile, tile, 1),
        "grid": (roundUpDiv(embsize, tile), roundUpDiv(nindices, tile), 1),
    }

    mod.embed(outdata, data, W, np.int32(nindices), np.int32(embsize), **launch)

    return outdata