def batchNorm2dTest():
	batchsize, maps, h, w = 4, 5, 3, 2

	data = CPUArray.toDevice(np.random.randn(batchsize, maps, h, w).astype(np.float32))
	hostData = data.get()

	scale = CPUArray.toDevice(np.random.randn(1, maps, 1, 1).astype(np.float32))
	bias = CPUArray.toDevice(np.random.randn(1, maps, 1, 1).astype(np.float32))
	mean = CPUArray.toDevice(np.random.randn(1, maps, 1, 1).astype(np.float32))
	var = CPUArray.toDevice(
		(np.ones((1, maps, 1, 1)).astype(np.float32) + np.random.randn(1, maps, 1, 1).astype(np.float32))**2
	)

	outdata = batchNorm2d(data, scale, bias, mean, var, test=True)

	hostScale, hostBias, hostMean, hostVar = scale.get(), bias.get(), mean.get(), var.get()

	# Reference implementation: normalize each map with the running statistics,
	# then apply the per-map affine transform
	hostNormData = np.empty(hostData.shape, dtype=np.float32)
	hostOutData = np.empty(hostData.shape, dtype=np.float32)

	for c in range(maps):
		hostNormData[:, c, :, :] = (hostData[:, c, :, :] - hostMean[0, c, 0, 0]) / np.sqrt(hostVar[0, c, 0, 0] + 1e-5)
		hostOutData[:, c, :, :] = hostNormData[:, c, :, :] * hostScale[0, c, 0, 0] + hostBias[0, c, 0, 0]

	assert np.allclose(hostOutData, outdata.get())

def crossEntropy(scores, labels, weights=None, error=None):
	assert scores.dtype == np.float32 and labels.dtype == np.int32

	shape = scores.shape

	# Pad with trailing singleton dimensions so the kernels always see a 4d layout
	if scores.ndim < 4:
		scores = scores.reshape(*shape, *(1 for _ in range(4 - scores.ndim)))

	softmax = softmaxNd(scores)
	grad = CPUArray.empty(shape, dtype=np.float32)

	if error is None:
		error = CPUArray.empty((), dtype=np.float32)

	error.fill(0.0)

	spatialDim = int(np.prod(scores.shape[2:]))
	mapStride = spatialDim * scores.shape[1]

	if weights is None:
		ceMod.cost(
			softmax.data, labels.data, mapStride, spatialDim, scores.shape[1], scores.shape[0],
			error.data, grad.data, softmax.size
		)
	else:
		wceMod.cost(
			softmax.data, labels.data, weights.data, mapStride, spatialDim, shape[1], shape[0],
			error.data, grad.data, softmax.size
		)

	return error, grad

def svmTest():
	batchsize, size = 20, 4

	scores = CPUArray.toDevice(np.random.randn(batchsize, size).astype(np.float32))
	labels = CPUArray.toDevice(np.random.randint(low=0, high=size, size=(batchsize, ), dtype=np.int32))

	error, grad = svm(scores, labels, mode="l1")

	hostScores, hostLabels = scores.get(), labels.get()

	# Reference implementation: one-vs-all L1 hinge loss
	hostGrad = np.empty(grad.shape, dtype=np.float32)
	hostError = 0.0

	for b in range(batchsize):
		for n in range(size):
			cls = 2 * (hostLabels[b] == n) - 1
			val = hostScores[b, n] * cls

			hostGrad[b, n] = cls / batchsize / size if val < 1 else 0.0
			hostError += max(0.0, 1.0 - val) / batchsize / size

	assert np.allclose(hostGrad, grad.get())
	assert np.isclose(hostError, error.get() / scores.shape[0])

def conv2dTest():
	batchsize, inmaps, h, w = 1, 2, 6, 6
	fsize, outmaps = 2, 4

	data = CPUArray.toDevice(np.random.randn(batchsize, inmaps, h, w).astype(np.float32))
	W = CPUArray.toDevice(np.random.randn(outmaps, inmaps, fsize, fsize).astype(np.float32))
	bias = CPUArray.toDevice(np.random.randn(1, outmaps, 1, 1).astype(np.float32))

	outdata = conv2d(data, W, bias)

	hostData, hostW, hostBias = data.get(), W.get(), bias.get()

	# Reference implementation: direct convolution (no padding, unit stride)
	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for c in range(outmaps):
		hostOutData[:, c, :, :] = hostBias[0, c, 0, 0]

	for b in range(batchsize):
		for oc in range(outmaps):
			for ic in range(inmaps):
				for y in range(outdata.shape[2]):
					for x in range(outdata.shape[3]):
						for dy in range(fsize):
							for dx in range(fsize):
								hostOutData[b, oc, y, x] += hostData[b, ic, y + dy, x + dx] * hostW[oc, ic, dy, dx]

	assert np.allclose(hostOutData, outdata.get())

def unittest():
	batchsize, maps, h, w = 3, 4, 5, 5
	epsilon = 1e-5

	data = CPUArray.toDevice(np.random.randn(batchsize, maps, h, w).astype(np.float32))
	scale = CPUArray.toDevice(np.random.randn(1, maps, 1, 1).astype(np.float32))
	bias = CPUArray.toDevice(np.random.randn(1, maps, 1, 1).astype(np.float32))

	outdata, savemean, savevar, extscale, extbias, desc = instanceNorm2d(data, scale, bias, epsilon)

	# Forward reference: each (sample, map) slice is normalized with its own statistics
	hostData = data.get().reshape(data.shape[0] * data.shape[1], -1)
	hostScale, hostBias = scale.get().reshape(maps, 1), bias.get().reshape(maps, 1)
	hostExtScale, hostExtBias = np.tile(hostScale, (batchsize, 1)), np.tile(hostBias, (batchsize, 1))

	hostMean = np.mean(hostData, axis=1, keepdims=True)
	hostVar = np.var(hostData, axis=1)
	hostInvVar = 1.0 / np.sqrt(hostVar + epsilon)

	hostOutData = (hostData - hostMean) * hostInvVar[:, np.newaxis]
	hostOutScData = hostOutData * hostExtScale + hostExtBias

	assert np.allclose(hostOutScData.reshape(data.shape), outdata.get())
	assert np.allclose(hostMean.reshape(savemean.shape), savemean.get())
	assert np.allclose(hostVar.reshape(savevar.shape), savevar.get())

	grad = CPUArray.toDevice(np.random.randn(batchsize, maps, h, w).astype(np.float32))
	ingrad, scalegrad, bgrad = instanceNorm2dBackward(grad, data, extscale, extbias, savemean, savevar, epsilon, desc)

	# Backward reference, per (sample, map) slice
	hostGrad = grad.get().reshape(grad.shape[0] * grad.shape[1], -1)

	hostScGrad = hostGrad * hostExtScale
	hostCorrs = np.empty(hostInvVar.shape, dtype=np.float32)

	for i in range(hostCorrs.shape[0]):
		hostCorrs[i] = np.dot(hostScGrad[i], hostOutData[i]) / hostScGrad.shape[1]

	hostInGrad = hostScGrad - np.mean(hostScGrad, axis=1, keepdims=True) - hostCorrs[:, np.newaxis] * hostOutData
	hostInGrad *= hostInvVar[:, np.newaxis]

	# Parameter gradients are summed over the batch back to per-map shape
	hostScaleGrad = np.sum(np.sum(hostOutData * hostGrad, axis=1).reshape(batchsize, -1), axis=0)
	hostBiasGrad = np.sum(np.sum(hostGrad, axis=1).reshape(batchsize, -1), axis=0)

	assert np.allclose(hostInGrad.reshape(grad.shape), ingrad.get())
	assert np.allclose(hostScaleGrad.reshape((1, maps, 1, 1)), scalegrad.get())
	assert np.allclose(hostBiasGrad.reshape((1, maps, 1, 1)), bgrad.get())

def unittest():
	A = CPUArray.toDevice(np.random.randn(5, 3).astype(np.float32))
	B = CPUArray.toDevice(np.random.randn(3, 4).astype(np.float32))

	C = mulMatrixOnMatrix(A, B)
	assert np.allclose(np.dot(A.get(), B.get()), C.get())

	F = mulMatrixOnMatrix(B, C, transpB=True)
	assert np.allclose(np.dot(B.get(), C.get().T), F.get())

	G = mulMatrixOnMatrix(F, B, transpA=True)
	assert np.allclose(np.dot(F.get().T, B.get()), G.get())

def eltwiseTest():
	outdata = CPUArray.empty((10, ), dtype=np.float32)
	indata = CPUArray.toDevice(np.random.randn(10).astype(np.float32))

	square = ElementwiseKernel(
		[(float_t.ptr, "outdata"), (float_t.const.ptr, "indata")],
		"outdata[i] = indata[i] * indata[i]",
		"square"
	)

	square(outdata, indata)

	hostInData = indata.get()
	hostOutData = hostInData * hostInData

	assert np.allclose(hostOutData, outdata.get())

def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0):
	assert not (transpA and transpB)
	assert A.ndim == 2 and B.ndim == 2
	assert alpha == 1.0 and beta == 0.0

	if transpA:
		assert A.shape[0] == B.shape[0]
		shape = (A.shape[1], B.shape[1])
	elif transpB:
		assert A.shape[1] == B.shape[1]
		shape = (A.shape[0], B.shape[0])
	else:
		assert A.shape[1] == B.shape[0]
		shape = (A.shape[0], B.shape[1])

	A = A.data.T if transpA else A.data
	B = B.data.T if transpB else B.data

	if out is None:
		out = CPUArray.empty(shape, dtype=np.float32)

	np.dot(A, B, out=out.data)
	return out

def maxpool2dTest():
	batchsize, maps, h, w = 1, 1, 8, 8

	data = CPUArray.toDevice(np.random.randn(batchsize, maps, h, w).astype(np.float32))
	outdata = pool2d(data)

	def maxDownSample2d(dat, factor):
		# Reference implementation: take strided views shifted by every in-window offset
		# and reduce them with an elementwise maximum
		trimrows = dat.shape[0] // factor * factor
		trimcols = dat.shape[1] // factor * factor

		maxSoFar = None
		first = True

		for coff in range(factor):
			for roff in range(factor):
				hopped = dat[roff:trimrows:factor, coff:trimcols:factor]

				if first:
					maxSoFar = hopped
					first = False
				else:
					maxSoFar = np.maximum(maxSoFar, hopped)

		return maxSoFar

	hostOutData = maxDownSample2d(data.get()[0, 0], 2)
	assert np.allclose(hostOutData, outdata.get())

def wrapTile(ary, times, axis):
	# Build a reps tuple that tiles `times` copies along `axis` and leaves all other axes intact
	shape = (times, )

	if axis > 0:
		shape = (1, ) * axis + shape

	if axis < ary.ndim - 1:
		shape = shape + (1, ) * (ary.ndim - 1 - axis)

	out = np.tile(ary.data, shape)
	return CPUArray(out.shape, out.dtype, data=out, acquire=True)

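# A minimal sanity check for wrapTile, written in the style of the other tests in this
# module; the test name and the comparison against np.tile are assumptions, not part
# of the original source.
def wrapTileTest():
	ary = CPUArray.toDevice(np.random.randn(3, 1, 4).astype(np.float32))

	out = wrapTile(ary, times=5, axis=1)

	# Tiling along axis 1 should match np.tile with reps (1, 5, 1)
	assert out.shape == (3, 5, 4)
	assert np.allclose(np.tile(ary.get(), (1, 5, 1)), out.get())
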
def addVectorToVector(x, y, out=None, alpha=1.0, beta=1.0):
	assert x.ndim == 1
	assert x.flags.forc and y.flags.forc
	assert x.shape == y.shape
	assert x.dtype == y.dtype and x.dtype == np.float32

	if out is None:
		out = CPUArray.empty(x.shape, dtype=np.float32)

	ElementWise.addVectorToVectorKer(out, x, y, alpha, beta)
	return out

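# A minimal check for addVectorToVector, hedged: it assumes the kernel computes
# out = alpha * x + beta * y, which the parameter names suggest but this module
# does not state explicitly.
def addVectorToVectorTest():
	x = CPUArray.toDevice(np.random.randn(16).astype(np.float32))
	y = CPUArray.toDevice(np.random.randn(16).astype(np.float32))

	out = addVectorToVector(x, y, alpha=2.0, beta=-1.0)
	assert np.allclose(2.0 * x.get() - y.get(), out.get())
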
def reflectpad1d(data, pad):
	assert data.dtype == np.float32 and data.ndim == 3

	batchsize, maps, insize = data.shape
	lpad, rpad = pad

	# Reflect padding mirrors without repeating the edge element, so each pad must fit inside the input
	assert insize >= max(lpad, rpad) + 1

	outdata = CPUArray.empty((batchsize, maps, insize + lpad + rpad), dtype=data.dtype)
	mod.reflectpad1d(outdata.data, data.data, batchsize, maps, insize, lpad, rpad)

	return outdata

def reflectpad2d(data, pad):
	assert data.dtype == np.float32 and data.ndim == 4

	batchsize, maps, inh, inw = data.shape
	upad, bpad, lpad, rpad = pad

	assert inh >= max(upad, bpad) + 1 and inw >= max(lpad, rpad) + 1

	outdata = CPUArray.empty((batchsize, maps, inh + upad + bpad, inw + lpad + rpad), dtype=data.dtype)
	mod.reflectpad2d(outdata.data, data.data, batchsize, maps, inh, inw, upad, bpad, lpad, rpad)

	return outdata

def reflectpad1dTest():
	batchsize, maps, insize = 4, 8, 48
	lpad, rpad = 2, 3

	data = CPUArray.toDevice(np.random.randn(batchsize, maps, insize).astype(np.float32))
	outdata = reflectpad1d(data, pad=(lpad, rpad))

	hostData, hostOutData = data.get(), outdata.get()

	# The middle of the output is the input itself; the borders are mirrored without the edge element
	assert np.allclose(hostOutData[:, :, lpad:insize + lpad], hostData)
	assert np.allclose(hostOutData[:, :, :lpad][:, :, ::-1], hostData[:, :, 1:lpad + 1])
	assert np.allclose(hostOutData[:, :, insize + lpad:][:, :, ::-1], hostData[:, :, insize - 1 - rpad:insize - 1])

def reductionTest():
	data = CPUArray.toDevice(np.random.randn(10).astype(np.float32))

	accumulate = ReductionKernel(
		np.float32, neutral="0.0f", reduceExpr="a + b", mapExpr="data[i]",
		arguments=[(float_t.const.ptr, "data")]
	)

	acc = accumulate(data)

	hostSum = np.sum(data.get())
	assert np.allclose(hostSum, acc.get())

def upsample2d(data, scale, mode="nearest"):
	batchsize, maps, inh, inw = data.shape

	hscale, wscale = (scale, scale) if isinstance(scale, int) else scale
	outh, outw = hscale * inh, wscale * inw

	outdata = CPUArray.empty((batchsize, maps, outh, outw), dtype=data.dtype)

	if mode == "nearest":
		nearestMod.upsample2dNearest(outdata.data, data.data, batchsize, maps, inh, inw, hscale, wscale)
	else:
		raise ValueError("Unsupported upsampling mode")

	return outdata

def svm(scores, labels, mode, error=None):
	assert scores.dtype == np.float32 and labels.dtype == np.int32

	shape = scores.shape
	grad = CPUArray.empty(shape, dtype=np.float32)

	if error is None:
		error = CPUArray.empty((), dtype=np.float32)

	error.fill(0.0)

	spatialDim = int(np.prod(scores.shape[2:]))
	mapStride = spatialDim * scores.shape[1]

	if mode == "l1":
		krl = svmL1Mod.cost
	elif mode == "l2":
		krl = svmL2Mod.cost
	else:
		raise ValueError("Unsupported SVM mode")

	krl(scores.data, labels.data, mapStride, spatialDim, shape[1], shape[0], error.data, grad.data, scores.size)
	return error, grad

def batchNorm2d(data, scale, bias, mean, var, epsilon=1e-5, test=False, out=None):
	assert data.ndim == scale.ndim and scale.ndim == bias.ndim and bias.ndim == mean.ndim and mean.ndim == var.ndim
	assert test  # only inference mode is implemented here

	# Fold the running statistics into the affine transform: out = scale' * (data - mean) + bias
	scale = scale.data / np.sqrt(var.data + epsilon)
	outdata = scale * (data.data - mean.data) + bias.data

	return CPUArray(outdata.shape, outdata.dtype, data=outdata, acquire=True)

def reflectpad2dTest():
	batchsize, maps, inh, inw = 4, 8, 12, 15
	upad, bpad, lpad, rpad = 2, 3, 2, 3

	data = CPUArray.toDevice(np.random.randn(batchsize, maps, inh, inw).astype(np.float32))
	outdata = reflectpad2d(data, pad=(upad, bpad, lpad, rpad))

	hostData, hostOutData = data.get(), outdata.get()

	assert np.allclose(hostOutData[:, :, upad:inh + upad, lpad:inw + lpad], hostData)
	assert np.allclose(hostOutData[:, :, :upad, :lpad][:, :, ::-1, ::-1], hostData[:, :, 1:upad + 1, 1:lpad + 1])
	assert np.allclose(
		hostOutData[:, :, inh + upad:, inw + lpad:][:, :, ::-1, ::-1],
		hostData[:, :, inh - 1 - bpad:inh - 1, inw - 1 - rpad:inw - 1]
	)

def crossEntropyTest():
	scores = CPUArray.toDevice(np.random.randn(20, 10, 3).astype(np.float32))
	labels = CPUArray.toDevice(np.random.randint(low=0, high=10, size=(20, 3)).astype(np.int32))

	error, grad = crossEntropy(scores, labels)

	def softmax(w):
		e = np.exp(w - np.amax(w))
		dist = e / np.sum(e)
		return dist

	def hostCrossEntropy(smax, target):
		smax = np.moveaxis(smax, 1, -1).reshape(-1, smax.shape[1])
		target = target.flatten()

		err = np.sum(np.log(np.array([smax[i, target[i]] for i in range(smax.shape[0])])))
		return -err / target.size

	def hostCrossEntropyGrad(target, smax):
		return np.array([(target == i) - smax[i] for i in range(smax.shape[0])])

	hostSoftmax = np.apply_along_axis(softmax, 1, scores.get())

	hostGrad = np.vstack([
		hostCrossEntropyGrad(labels.get()[i], hostSoftmax[i]) / scores.shape[0] for i in range(scores.shape[0])
	]).reshape(*hostSoftmax.shape)

	assert np.allclose(hostGrad, grad.get())

	hostError = hostCrossEntropy(hostSoftmax, labels.get())
	assert np.isclose(hostError, error.get() / scores.shape[0])

def instanceNorm2d(data, scale, bias, epsilon=1e-5):
	batchsize = data.shape[0]

	if batchsize > 1:
		# Replicate the per-map affine parameters for every sample in the batch
		extscale = CPUArray.toDevice(np.tile(scale.data, (batchsize, 1, 1)))
		extbias = CPUArray.toDevice(np.tile(bias.data, (batchsize, 1, 1)))
	else:
		extscale = scale
		extbias = bias

	# Fold the batch dimension into the map dimension so that batch norm computes
	# per-(sample, map) statistics, which is exactly instance norm
	indata = data.reshape(1, batchsize * data.shape[1], data.shape[2], data.shape[3])

	mean = CPUArray.empty((1, indata.shape[1], 1, 1), dtype=np.float32)
	var = CPUArray.empty((1, indata.shape[1], 1, 1), dtype=np.float32)

	outdata, savemean, savevar, desc = DNNL.batchNormNd(indata, extscale, extbias, mean, var, epsilon, test=False)
	return outdata.reshape(data.shape), savemean, savevar, extscale, extbias, desc

def instanceNorm2dBackward(grad, data, extscale, extbias, savemean, savevar, epsilon, desc, affine=True):
	batchsize, maps = grad.shape[:2]

	outgrad = grad.reshape(1, batchsize * grad.shape[1], grad.shape[2], grad.shape[3])
	indata = data.reshape(1, batchsize * data.shape[1], data.shape[2], data.shape[3])

	ingrad, scalegrad, biasgrad = DNNL.batchNormNdBackward(
		indata, outgrad, extscale, extbias, savemean, savevar, desc, epsilon
	)

	if affine and batchsize > 1:
		# Sum the per-sample parameter gradients back into per-map gradients
		scalegrad = np.sum(scalegrad.data.reshape(batchsize, -1), axis=0).reshape((1, maps, 1, 1))
		biasgrad = np.sum(biasgrad.data.reshape(batchsize, -1), axis=0).reshape((1, maps, 1, 1))

		scalegrad = CPUArray(scalegrad.shape, scalegrad.dtype, data=scalegrad, acquire=True)
		biasgrad = CPUArray(biasgrad.shape, biasgrad.dtype, data=biasgrad, acquire=True)

	return (ingrad.reshape(grad.shape), scalegrad, biasgrad) if affine else ingrad.reshape(grad.shape)

def __call__(self, *args, **kwargs):
	if self.module is None:
		source, functions = self.generateSource()
		self.module = SourceModule(
			source, functions, converter=self.paramConverter, finalizer=self.funcFinalizer, debug=self.debug
		)

	acc = self.module.reduction(*(arg.data if isinstance(arg, CPUArray) else arg for arg in args))

	result = CPUArray.empty((), self.outtype)
	result.fill(acc)

	return result

def sumOnMatrix(A, out=None, cols=True, alpha=1.0, beta=0.0):
	assert A.ndim == 2
	assert A.flags.c_contiguous
	assert A.dtype == np.float32

	if out is None:
		out = CPUArray.empty((A.shape[1], ) if cols else (A.shape[0], ), dtype=np.float32)

	if alpha == 1.0 and beta == 0.0:
		np.sum(A.data, axis=0 if cols else 1, out=out.data)
	else:
		# General case: out = beta * out + alpha * sum(A)
		s = np.sum(A.data, axis=0 if cols else 1)
		np.add(beta * out.data, alpha * s, out=out.data)

	return out

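# A minimal check for sumOnMatrix in the style of the other tests here; the test name
# is an assumption. It exercises both the fast path and the alpha/beta accumulation path.
def sumOnMatrixTest():
	A = CPUArray.toDevice(np.random.randn(6, 4).astype(np.float32))

	out = sumOnMatrix(A)
	assert np.allclose(np.sum(A.get(), axis=0), out.get())

	# Accumulate on top of the previous result: out = 1.0 * out + 2.0 * colsum = 3 * colsum
	out = sumOnMatrix(A, out=out, cols=True, alpha=2.0, beta=1.0)
	assert np.allclose(3.0 * np.sum(A.get(), axis=0), out.get())
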
def pool2d(data, size=2, stride=2, pad=0, mode=PoolMode.max):
	assert data.ndim == 4
	onRow = np.max if mode == PoolMode.max else np.mean

	batchsize, maps, inh, inw = data.shape

	size, stride, pad = repeatValue(size, 2), repeatValue(stride, 2), repeatValue(pad, 2)
	outh, outw = outshape((inh, inw), size, stride, pad)

	# Lower pooling to im2col: every pooling window becomes a row, reduced with max or mean
	coldata = im2col(data.data.reshape(batchsize * maps, 1, inh, inw), size, stride, pad)
	outdata = onRow(coldata, axis=1, keepdims=True).reshape((batchsize, maps, outh, outw))

	return CPUArray(outdata.shape, outdata.dtype, data=outdata, acquire=True)

def conv2d(data, W, bias=None, stride=1, pad=0):
	assert data.ndim == 4 and W.ndim == 4

	batchsize, _, inh, inw = data.shape
	stride, pad = repeatValue(stride, 2), repeatValue(pad, 2)

	outmaps, _, hsize, wsize = W.shape
	outh, outw = outshape((inh, inw), (hsize, wsize), stride, pad)

	# Lower convolution to a matrix multiply: im2col, then GEMM, then fold back to feature maps
	coldata = im2col(data.data, W.shape[2:], stride, pad)

	W = W.data.reshape(W.shape[0], -1).T
	bias = bias.data.reshape(1, bias.shape[1]) if bias is not None else None

	outdata = linear(coldata, W, bias)
	outdata = col2im(outdata, outmaps, (outh, outw))

	return CPUArray(outdata.shape, outdata.dtype, data=outdata, acquire=True)

def unittest():
	batchsize, maps, inh, inw = 3, 2, 16, 15
	scale = 2

	data = CPUArray.toDevice(
		np.random.uniform(low=-1.0, high=1.0, size=(batchsize, maps, inh, inw)).astype(np.float32)
	)

	outdata = upsample2d(data, scale, mode="nearest")

	hostData = data.get()

	# Reference implementation: every input pixel becomes a scale x scale block
	hostOutData = np.empty(outdata.shape, dtype=np.float32)

	for b in range(batchsize):
		for c in range(maps):
			for y in range(inh):
				for x in range(inw):
					hostOutData[b, c, y * scale:(y + 1) * scale, x * scale:(x + 1) * scale] = hostData[b, c, y, x]

	assert np.allclose(hostOutData, outdata.get())

def mulMatrixOnMatrix(A, B, out=None, transpA=False, transpB=False, alpha=1.0, beta=0.0):
	assert not (transpA and transpB)
	assert A.ndim == 2 and B.ndim == 2
	assert A.dtype == B.dtype and A.dtype == np.float32
	assert A.flags.c_contiguous and B.flags.c_contiguous

	if transpA:
		assert A.shape[0] == B.shape[0]
		shape = (A.shape[1], B.shape[1])
	elif transpB:
		assert A.shape[1] == B.shape[1]
		shape = (A.shape[0], B.shape[0])
	else:
		assert A.shape[1] == B.shape[0]
		shape = (A.shape[0], B.shape[1])

	if out is None:
		out = CPUArray.empty(shape, dtype=np.float32)

	# dnnl_sgemm takes row-major matrices, so each leading dimension is that matrix's row width
	if transpA:
		k, m = A.shape
		n = B.shape[1]
		libdnnl.dnnl_sgemm('t', 'n', m, n, k, alpha, A.ptr, m, B.ptr, n, beta, out.ptr, n)
	elif transpB:
		m, k = A.shape
		n = B.shape[0]
		libdnnl.dnnl_sgemm('n', 't', m, n, k, alpha, A.ptr, k, B.ptr, k, beta, out.ptr, n)
	else:
		m, k = A.shape
		n = B.shape[1]
		libdnnl.dnnl_sgemm('n', 'n', m, n, k, alpha, A.ptr, k, B.ptr, n, beta, out.ptr, n)

	return out

def build(self):
	# First pass: compute the total byte size of all registered blocks
	nbytes = 0

	for reg in self.regs:
		shape, dtype, _ = reg
		assert dtype == self.dtype

		nbytes += int(np.prod(shape) * dtype(0).itemsize)

	self.mem = CPUArray.empty((nbytes, ), np.uint8)

	# Second pass: carve the flat byte buffer into typed views, one per registered block
	offset = 0

	for shape, dtype, name in self.regs:
		regbytes = int(np.prod(shape) * dtype(0).itemsize)
		assert offset + regbytes <= self.mem.size

		self.blocks[name] = self.mem[offset:offset + regbytes].view(dtype).reshape(shape)
		offset += regbytes

	self.regs.clear()
	self.ary = self.mem.view(dtype=self.dtype)

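# A standalone sketch of the same carving technique in plain numpy, for illustration only;
# the regs layout and names here are hypothetical, while build() does this on a CPUArray
# owned by its class.
def carveBlocksSketch():
	regs = [((2, 3), np.float32, "W"), ((3, ), np.float32, "b")]

	nbytes = sum(int(np.prod(shape) * dtype(0).itemsize) for shape, dtype, _ in regs)
	mem = np.empty((nbytes, ), dtype=np.uint8)

	blocks, offset = {}, 0

	for shape, dtype, name in regs:
		regbytes = int(np.prod(shape) * dtype(0).itemsize)

		# Each block is a typed view into the shared buffer, so no copies are made
		blocks[name] = mem[offset:offset + regbytes].view(dtype).reshape(shape)
		offset += regbytes

	assert blocks["W"].shape == (2, 3) and blocks["b"].shape == (3, )
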
def recognize(self, audio_path):
	preprocessed_audio = preprocess(audio_path, self.sample_rate, self.window_size, self.window_stride)

	if self.cpu:
		from PuzzleLib.CPU.CPUArray import CPUArray
		inputs = CPUArray.toDevice(np.array([preprocessed_audio]).astype(np.float32))
	else:
		from PuzzleLib.Backend import gpuarray
		inputs = gpuarray.to_gpu(np.array([preprocessed_audio]).astype(np.float16))

	output = self.w2l(inputs).get()
	output = np.vstack(output).astype(np.float32)

	result = self.decoder.decode(output)

	if not self.cpu:
		from PuzzleLib.Backend.gpuarray import memoryPool
		memoryPool.freeHeld()

	del inputs, output
	return result