Пример #1
0
def prelu(data, slopes, inplace=False, sharedMaps=False):
    """Forward PReLU activation on an NCHW tensor.

    `slopes` holds either one slope per feature map (axis 1), or a single
    slope shared by all maps when sharedMaps is True.  With inplace=True the
    input buffer is written in place; otherwise a fresh output is allocated.
    Returns the activated tensor.
    """
    assert data.dtype == slopes.dtype and slopes.dtype == np.float32

    if not sharedMaps:
        assert data.shape[1] == slopes.shape[0]
    else:
        assert slopes.shape == (1, )

    result = data if inplace else Driver.empty(
        queue, data.shape, dtype=np.float32, allocator=memPool)

    totalSize = int(np.prod(data.shape))
    mapSize = np.prod(data.shape[2:])
    maps = data.shape[1]

    # With a shared slope, map indices are collapsed by dividing them out.
    divisor = maps if sharedMaps else 1

    block = (nthreads, 1, 1)
    grid = (roundUp(totalSize, nthreads), 1, 1)

    mod.prelu(queue, grid, block, result.data, data.data, slopes.data,
              np.int32(divisor), np.int32(mapSize), np.int32(maps),
              np.int32(totalSize))

    return result
Пример #2
0
def svm(scores, labels, mode, error=None):
    """SVM cost and gradient over `scores` against integer `labels`.

    mode selects the hinge penalty: "l1" or "l2" (anything else raises
    ValueError).  `error` is an optional scalar accumulator; it is zeroed
    before the kernel runs.  Returns (error, grad).
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    grad = Driver.empty(queue, shape, dtype=np.float32, allocator=memPool)

    if error is None:
        error = Driver.empty(queue, (), dtype=np.float32, allocator=memPool)
    error.fill(0.0)

    totalSize = int(np.prod(shape))
    spatialDim = int(np.prod(shape[2:]))
    mapStride = spatialDim * shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUp(totalSize, nthreads), 1, 1)

    # Mode check happens after error.fill, matching the original side-effect order.
    if mode == "l1":
        kernel = svmL1Mod.cost
    elif mode == "l2":
        kernel = svmL2Mod.cost
    else:
        raise ValueError()

    labelOffset = labels.offset // labels.dtype.itemsize
    kernel(queue, grid, block, scores.data, labels.base_data,
           np.int32(labelOffset), np.int32(totalSize), np.int32(mapStride),
           np.int32(spatialDim), np.int32(shape[1]), np.int32(shape[0]),
           error.data, grad.data)

    return error, grad
Пример #3
0
def preluBackwardData(grad, slopes, indata, sharedMaps=False):
    """Backward PReLU pass with respect to the input data.

    `indata` is the forward-pass input (used by the kernel to pick between
    the identity and the slope path); `grad` is the incoming gradient.
    Returns the gradient with respect to the input.
    """
    assert grad.dtype == slopes.dtype and slopes.dtype == indata.dtype and indata.dtype == np.float32
    assert grad.shape == indata.shape

    if not sharedMaps:
        assert grad.shape[1] == slopes.shape[0]
    else:
        assert slopes.shape == (1, )

    ingrad = Driver.empty(queue, grad.shape, dtype=np.float32,
                          allocator=memPool)

    totalSize = int(np.prod(grad.shape))
    mapSize = np.prod(grad.shape[2:])
    maps = grad.shape[1]

    # With a shared slope, map indices are collapsed by dividing them out.
    divisor = maps if sharedMaps else 1

    block = (nthreads, 1, 1)
    grid = (roundUp(totalSize, nthreads), 1, 1)

    mod.preluBackwardData(queue, grid, block, ingrad.data, grad.data,
                          slopes.data, indata.data, np.int32(divisor),
                          np.int32(mapSize), np.int32(maps),
                          np.int32(totalSize))

    return ingrad
Пример #4
0
def preluBackwardParams(indata, outgrad, sharedMaps=False):
    """Backward PReLU pass with respect to the slope parameters.

    First a kernel reduces over the batch dimension into a per-element
    buffer, then CLBlas sums that buffer down to one value per map
    (or a single value when the slope is shared across maps).
    """
    assert indata.dtype == outgrad.dtype and outgrad.dtype == np.float32
    assert indata.shape == outgrad.shape

    innerSize = int(np.prod(outgrad.shape[1:]))
    innerStride = np.prod(outgrad.shape[1:])

    slopegrad = Driver.empty(queue, outgrad.shape[1:], dtype=np.float32,
                             allocator=memPool)

    block = (nthreads, 1, 1)
    grid = (roundUp(innerSize, nthreads), 1, 1)

    mod.preluBackwardParams(queue, grid, block, slopegrad.data, outgrad.data,
                            indata.data, np.int32(outgrad.shape[0]),
                            np.int32(innerStride), np.int32(innerSize))

    if sharedMaps:
        matShape = (1, int(np.prod(slopegrad.shape)))
    else:
        matShape = (slopegrad.shape[0], int(np.prod(slopegrad.shape[1:])))

    return CLBlas.sumOnMatrix(slopegrad.reshape(matShape), cols=False)
Пример #5
0
def embedBackwardParams(indata, grad, W, scale):
    """Accumulate embedding-table gradients: scatter `grad` rows into W.

    `indata` holds int32 token indices of shape (batch, sentlen); `grad`
    is (batch, sentlen, embsize).  W is updated in place, scaled by `scale`.
    Returns None.
    """
    assert indata.shape == grad.shape[:2] and W.shape[1] == grad.shape[2]
    assert indata.dtype == np.int32 and grad.dtype == W.dtype and W.dtype == np.float32

    batchsize, sentlen = indata.shape
    embsize = W.shape[1]
    rows = batchsize * sentlen

    side = warpSize // 4
    block = (side, side, 1)
    grid = (roundUp(embsize, side), roundUp(rows, side), 1)

    mod.embedBackwardParams(queue, grid, block, W.data, grad.data,
                            indata.base_data, np.int32(indata.item_offset),
                            np.float32(scale), np.int32(rows),
                            np.int32(embsize))
Пример #6
0
def addVecToMat(vec, mat, axis=0, inplace=True, out=None, alpha=1.0, beta=1.0, gamma=0.0):
	"""Broadcast-combine a vector with a matrix along `axis`.

	axis=0 applies `vec` down the columns, axis=1 along the rows (with a
	tiled variant when the vector length divides the row width).  With
	inplace=True the result overwrites `mat`, ignoring `out`.
	NOTE(review): the exact alpha/beta/gamma formula lives in the kernel
	source — confirm there before relying on it.
	"""
	assert vec.dtype == mat.dtype and mat.dtype == np.float32
	assert vec.ndim == 1 and mat.ndim == 2

	if axis == 0:
		assert vec.shape[0] == mat.shape[0]
	elif axis == 1:
		assert vec.shape[0] == mat.shape[1] or mat.shape[1] % vec.shape[0] == 0

	block = (16, 16, 1)

	# Separate compiled modules: the gamma term costs an extra read of `out`.
	vecMatMod = gammaVecMatMod if gamma != 0.0 else nonGammaVecMatMod

	rows, cols = mat.shape
	grid = (roundUp(cols, block[0]), roundUp(rows, block[1]), 1)

	if inplace:
		out = mat
	elif out is None:
		out = Driver.empty(queue, mat.shape, dtype=np.float32, allocator=memPool)

	if axis == 0:
		vecMatMod.opColVecToMat(queue, grid, block, mat.data, vec.data, out.data,
								np.int32(rows), np.int32(cols), np.float32(beta),
								np.float32(alpha), np.float32(gamma))

	elif axis == 1:
		if vec.shape[0] == mat.shape[1]:
			vecMatMod.opRowVecToMat(queue, grid, block, mat.data, vec.data, out.data,
									np.int32(rows), np.int32(cols), np.float32(beta),
									np.float32(alpha), np.float32(gamma))
		else:
			# Vector shorter than a row: kernel tiles it across each row.
			vecMatMod.opRowOneVecToMat(queue, grid, block, mat.data, vec.data, out.data,
									   np.int32(rows), np.int32(cols),
									   np.int32(vec.shape[0]), np.float32(beta),
									   np.float32(alpha), np.float32(gamma))

	else:
		raise ValueError("Unknown axis %s was given" % axis)

	return out
Пример #7
0
def upsample2dBackward(grad, scale, mode="nearest"):
    """Backward pass of 2D upsampling: reduce `grad` to the pre-upsample size.

    `scale` is an int or an (hscale, wscale) pair.  mode is "nearest" or
    "linear"; anything else raises ValueError.  Returns the input gradient
    of shape (batch, maps, outh // hscale, outw // wscale).
    """
    batchsize, maps, outh, outw = grad.shape

    hscale, wscale = (scale, scale) if isinstance(scale, int) else scale
    inh, inw = outh // hscale, outw // wscale
    inshape = (batchsize, maps, inh, inw)

    if mode == "nearest":
        ingrad = Driver.empty(queue, inshape, dtype=grad.dtype,
                              allocator=memPool)

        threads = warpSize * 4
        block = (threads, 1, 1)
        grid = (roundUp(ingrad.size, threads), 1, 1)

        nearestMod.upsample2dNearestBackward(
            queue, grid, block, ingrad.data, grad.data, np.int32(inw),
            np.int32(outw), np.int32(hscale), np.int32(wscale),
            np.int32(ingrad.size))

    elif mode == "linear":
        # Zero-initialized (unlike the nearest branch) — the linear kernel
        # appears to accumulate into the buffer; confirm in kernel source.
        ingrad = Driver.zeros(queue, inshape, dtype=grad.dtype,
                              allocator=memPool)

        side = warpSize // 4
        block = (side, side, 1)
        grid = (roundUp(outw, side), roundUp(outh, side), 1)

        rh = (inh - 1) / (outh - 1)
        rw = (inw - 1) / (outw - 1)

        linearMod.upsample2dLinearBackward(
            queue, grid, block, ingrad.data, grad.data, np.int32(batchsize),
            np.int32(maps), np.int32(inh), np.int32(inw), np.int32(outh),
            np.int32(outw), np.float32(rh), np.float32(rw))

    else:
        raise ValueError("Unrecognized sampling mode")

    return ingrad
Пример #8
0
def upsample3d(data, scale, mode="nearest"):
    """Upsample a 5D NCDHW tensor by integer factors along D/H/W.

    `scale` is an int or a (dscale, hscale, wscale) triple.  mode is
    "nearest" or "linear"; anything else raises ValueError.  Returns the
    upsampled tensor.
    """
    batchsize, maps, ind, inh, inw = data.shape

    dscale, hscale, wscale = (scale, ) * 3 if isinstance(scale, int) else scale

    outd, outh, outw = ind * dscale, inh * hscale, inw * wscale
    outdata = Driver.empty(queue, (batchsize, maps, outd, outh, outw),
                           dtype=data.dtype, allocator=memPool)

    if mode == "nearest":
        block = (wblocksize, hblocksize, 1)
        grid = (roundUp(inw, block[0]), roundUp(inh, block[1]),
                batchsize * maps * ind)

        nearestMod.upsample3dNearest(
            queue, grid, block, outdata.data, data.data, np.int32(ind),
            np.int32(inh), np.int32(inw), np.int32(outd), np.int32(outh),
            np.int32(outw), np.int32(dscale), np.int32(hscale),
            np.int32(wscale))

    elif mode == "linear":
        # Resampling ratios mapping output coordinates back to input space.
        rd = (ind - 1) / (outd - 1)
        rh = (inh - 1) / (outh - 1)
        rw = (inw - 1) / (outw - 1)

        side = warpSize // 4
        block = (side, side, 1)
        grid = (roundUp(outw, side), roundUp(outh, side), outd)

        linearMod.upsample3dLinear(
            queue, grid, block, outdata.data, data.data, np.int32(batchsize),
            np.int32(maps), np.int32(ind), np.int32(inh), np.int32(inw),
            np.int32(outd), np.int32(outh), np.int32(outw), np.float32(rd),
            np.float32(rh), np.float32(rw))

    else:
        raise ValueError("Unsupported upsampling mode")

    return outdata
Пример #9
0
def embed(data, W):
    """Embedding lookup: out[b, s] = W[data[b, s]] for int32 token indices.

    Returns a (batch, sentlen, embsize) float32 tensor.
    """
    assert data.dtype == np.int32 and W.dtype == np.float32

    batchsize, sentlen = data.shape
    embsize = W.shape[1]
    rows = batchsize * sentlen

    # Zero-filled so the kernel may skip some entries — presumably padding
    # indices; confirm against the kernel source.
    outdata = Driver.zeros(queue, (batchsize, sentlen, embsize),
                           dtype=np.float32, allocator=memPool)

    side = warpSize // 4
    block = (side, side, 1)
    grid = (roundUp(embsize, side), roundUp(rows, side), 1)

    mod.embed(queue, grid, block, outdata.data, data.base_data,
              np.int32(data.item_offset), W.data, np.int32(rows),
              np.int32(embsize))
    return outdata
Пример #10
0
def maxunpool2dBackward(grad, poolshape, mask):
    """Backward 2D max-unpooling: gather `grad` back through `mask` indices.

    `poolshape` is the pre-unpool (pooled) shape whose spatial dims give the
    output size.  Returns the gradient with respect to the pooled input.
    """
    assert grad.dtype == np.float32 and mask.dtype == np.int32

    batchsize, maps, outh, outw = grad.shape
    inh, inw = poolshape[2], poolshape[3]

    ingrad = Driver.empty(queue, (batchsize, maps, inh, inw),
                          dtype=np.float32, allocator=memPool)

    totalSize = int(np.prod(ingrad.shape))

    block = (nthreads, 1, 1)
    grid = (roundUp(totalSize, nthreads), 1, 1)

    mod.maxunpool2dBackward(queue, grid, block, ingrad.data, grad.data,
                            mask.data, np.int32(inh), np.int32(inw),
                            np.int32(outh), np.int32(outw), np.int32(maps),
                            np.int32(totalSize))

    return ingrad
Пример #11
0
def maxunpool2d(data, origshape, mask):
    """2D max-unpooling: scatter `data` into `origshape` at `mask` positions.

    `origshape` is the pre-pool shape whose spatial dims give the output
    size; positions not referenced by `mask` stay zero (hence zeros, not
    empty).  Returns the unpooled tensor.
    """
    assert data.dtype == np.float32

    batchsize, maps, inh, inw = data.shape
    outh, outw = origshape[2], origshape[3]

    outdata = Driver.zeros(queue, (batchsize, maps, outh, outw),
                           dtype=np.float32, allocator=memPool)

    totalSize = int(np.prod(data.shape))

    block = (nthreads, 1, 1)
    grid = (roundUp(totalSize, nthreads), 1, 1)

    mod.maxunpool2d(queue, grid, block, outdata.data, data.data, mask.data,
                    np.int32(inh), np.int32(inw), np.int32(outh),
                    np.int32(outw), np.int32(maps), np.int32(totalSize))

    return outdata
Пример #12
0
def crossEntropy(scores, labels, weights=None, error=None):
    """Softmax cross-entropy cost and gradient.

    `scores` is float32 (padded to 4D NCHW internally); `labels` holds int32
    class indices.  `weights`, if given, supplies per-class weights and
    selects the weighted kernel.  `error` is an optional scalar accumulator,
    zeroed before use.  Returns (error, grad); grad keeps the caller's
    original shape.
    """
    assert scores.dtype == np.float32 and labels.dtype == np.int32

    shape = scores.shape
    # Kernels expect 4D data: pad trailing singleton dimensions if needed.
    if scores.ndim < 4:
        scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

    softmax = softmax2d(scores)

    grad = Driver.empty(queue, shape, dtype=np.float32, allocator=memPool)
    if error is None:
        error = Driver.empty(queue, (), dtype=np.float32, allocator=memPool)

    error.fill(0.0)

    size = int(np.prod(scores.shape))
    spatialDim = int(np.prod(scores.shape[2:]))
    mapStride = spatialDim * scores.shape[1]

    block = (nthreads, 1, 1)
    grid = (roundUp(size, nthreads), 1, 1)

    # Fix: both branches now index the padded 4D shape.  The weighted branch
    # previously used the original `shape`, which raised IndexError for <2D
    # scores; for >=2D input the values are identical, so callers see no change.
    numMaps, numCases = scores.shape[1], scores.shape[0]
    labelOffset = labels.offset // labels.dtype.itemsize

    if weights is None:
        ceMod.cost(queue, grid, block, softmax.data, labels.base_data,
                   np.int32(labelOffset), np.int32(size), np.int32(mapStride),
                   np.int32(spatialDim), np.int32(numMaps),
                   np.int32(numCases), error.data, grad.data)

    else:
        wceMod.cost(queue, grid, block, softmax.data, labels.base_data,
                    np.int32(labelOffset), weights.data, np.int32(size),
                    np.int32(mapStride), np.int32(spatialDim),
                    np.int32(numMaps), np.int32(numCases),
                    error.data, grad.data)

    return error, grad
Пример #13
0
def transformTensor(tensor, strides, inoffset=0, outoffset=0, shape=None, out=None):
	"""Copy `tensor` into a buffer laid out with the given byte `strides`.

	`strides` are byte strides for the destination layout (one per source
	dimension); `inoffset`/`outoffset` are byte offsets into the source and
	destination buffers.  `shape` overrides the logical copy shape (defaults
	to tensor.shape); `out` is an optional preallocated destination.
	Supports 1 to 5 dimensions; raises NotImplementedError otherwise.
	Returns the destination tensor.
	"""
	assert tensor.dtype == np.float32 and tensor.ndim <= 5
	assert tensor.ndim == len(strides)

	if shape is None:
		shape = tensor.shape

	size = np.prod(shape)
	ndim = len(shape)

	if out is None:
		out = Driver.empty(queue, shape, dtype=tensor.dtype, allocator=memPool)

	# Convert byte strides to element strides for the kernel.
	instrides = tuple(s // tensor.dtype.itemsize for s in tensor.strides)
	outstrides = tuple(s // tensor.dtype.itemsize for s in strides)

	# The kernel takes a fixed 5-dim parameter set; unused slots stay zero.
	sh0, sh1, sh2, sh3, sh4 = 0, 0, 0, 0, 0
	ss0, ss1, ss2, ss3 = 0, 0, 0, 0

	ds0, ds1, ds2, ds3, ds4 = 0, 0, 0, 0, 0

	if ndim == 1:
		block = (warpSize * 4, 1, 1)
		grid = (min(roundUp(size, block[0]), block[0] * 256), 1, 1)

		sh0 = shape[0]
		ds0 = outstrides[0]

	elif ndim == 2:
		block = (warpSize * 4, 1, 1)
		grid = (block[0], min(shape[0], 256), 1)

		sh0, sh1 = shape[:2]
		ss0 = instrides[0]

		ds0, ds1 = outstrides[:2]

	elif ndim == 3:
		block = (warpSize, 1, 1)
		grid = (block[0], min(shape[1], 64), min(shape[0], 16))

		# Fix: read from `shape` like every other branch (was tensor.shape[:3],
		# which is wrong when an explicit `shape` argument differs from it).
		sh0, sh1, sh2 = shape[:3]
		ss0, ss1 = instrides[:2]

		ds0, ds1, ds2 = outstrides[:3]

	elif ndim == 4:
		block = (warpSize, 1, 1)
		grid = (block[0], min(shape[2], 64), min(shape[1], 16))

		sh0, sh1, sh2, sh3 = shape[:4]
		ss0, ss1, ss2 = instrides[:3]

		ds0, ds1, ds2, ds3 = outstrides[:4]

	elif ndim == 5:
		block = (warpSize, 1, 1)
		grid = (block[0], min(shape[3], 64), min(shape[2], 16))

		sh0, sh1, sh2, sh3, sh4 = shape
		ss0, ss1, ss2, ss3 = instrides[:4]

		ds0, ds1, ds2, ds3, ds4 = outstrides

	else:
		raise NotImplementedError()

	# Byte offsets -> element offsets.
	inoffset //= tensor.dtype.itemsize
	outoffset //= tensor.dtype.itemsize

	mod.transform(queue, grid, block, tensor.base_data, out.base_data,
				  np.int32(tensor.offset + inoffset), np.int32(ss0), np.int32(ss1), np.int32(ss2), np.int32(ss3),
				  np.int32(sh0), np.int32(sh1), np.int32(sh2), np.int32(sh3), np.int32(sh4), np.int32(tensor.size),
				  np.int32(out.offset + outoffset), np.int32(ds0), np.int32(ds1), np.int32(ds2), np.int32(ds3),
				  np.int32(ds4), np.int32(out.size), np.int32(ndim))
	return out