def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda_tensor4()
    k = cuda_tensor4()

    if direction == 'fprop':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_img, npy_kern[:, :, ::-1, ::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
            npy_img.transpose(1, 0, 2, 3),
            npy_kern.transpose(1, 0, 2, 3)[:, :, ::-1, ::-1])).transpose(
            1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
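# A minimal driver sketch (hypothetical sizes, not from the original suite;
# assumes the same module-level imports and `theano_mode` that
# gemm_directly above relies on): exercising all three directions on one
# small configuration.
def test_gemm_directly_smoke():
    for direction in ('fprop', 'bprop img', 'bprop kern'):
        gemm_directly(bs=2, ch=3, nf=4, rImg1=8, rImg2=8,
                      rFlt1=3, rFlt2=3, subsx=1, subsy=1,
                      direction=direction)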
def test_compare_1D_and_2D_upsampling_values(self):
    """Compare 1D and 2D upsampling

    This method verifies that bilinear upsampling done with 1D and 2D
    kernels generates the same result.
    """
    # checking upsampling with ratio 5
    input_x = np.random.rand(5, 4, 6, 7).astype(theano.config.floatX)
    mat_1D = bilinear_upsampling(input=input_x, ratio=5,
                                 batch_size=5, num_input_channels=4,
                                 use_1D_kernel=True)
    mat_2D = bilinear_upsampling(input=input_x, ratio=5,
                                 batch_size=5, num_input_channels=4,
                                 use_1D_kernel=False)
    f_1D = theano.function([], mat_1D, mode=self.compile_mode)
    f_2D = theano.function([], mat_2D, mode=self.compile_mode)
    utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)

    # checking upsampling with ratio 8
    input_x = np.random.rand(12, 11, 10, 7).astype(theano.config.floatX)
    mat_1D = bilinear_upsampling(input=input_x, ratio=8,
                                 batch_size=12, num_input_channels=11,
                                 use_1D_kernel=True)
    mat_2D = bilinear_upsampling(input=input_x, ratio=8,
                                 batch_size=12, num_input_channels=11,
                                 use_1D_kernel=False)
    f_1D = theano.function([], mat_1D, mode=self.compile_mode)
    f_2D = theano.function([], mat_2D, mode=self.compile_mode)
    utt.assert_allclose(f_1D(), f_2D(), rtol=1e-06)
def run_conv_valid(self, inputs_shape, filters_shape, pad=False):
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')

    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

    # Flip the filters as conv3D computes correlation
    filters_flip = filters[:, ::-1, ::-1, ::-1, :]
    # filters_flip = filters
    conv_ref = theano.tensor.nnet.conv3D(V=inputs, W=filters_flip,
                                         b=bias, d=(1, 1, 1))

    conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(
        inputs.dimshuffle(0, 4, 1, 2, 3),
        filters.dimshuffle(0, 4, 1, 2, 3),
        border_mode="valid",
        pad_last_dim=pad)
    conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)

    f_ref = theano.function([], conv_ref, mode="FAST_RUN")
    mode = mode_with_gpu
    mode.check_py_code = False
    f_fft = theano.function([], conv_fft, mode=mode)

    res_ref = f_ref()
    res_fft = f_fft()
    utt.assert_allclose(res_ref, res_fft, rtol=1e-05, atol=1e-05)
def test_cast_float16(self):
    f16 = theano.tensor.vector(dtype='float16')
    f32 = theano.tensor.fvector()
    i8 = theano.tensor.bvector()
    f = theano.function([f16, f32, i8],
                        [f16.astype('float32'),
                         f32.astype('float16'),
                         f32.astype('float64'),
                         f16.astype('int8'),
                         f32.astype('int8'),
                         i8.astype('float16'),
                         i8.astype('float32')],
                        mode=mode_with_gpu)
    d1 = (np.random.rand(4) * 10).astype('float16')
    d2 = (np.random.rand(5) * 10).astype('float32')
    d3 = (np.random.rand(6) * 10).astype('int8')
    res = f(d1, d2, d3)

    for i, out in enumerate(f.outputs):
        dtype = out.variable.dtype
        assert res[i].dtype == dtype
        inp = out.variable.owner.inputs[0]
        if inp.dtype == 'float16':
            d = d1
        elif inp.dtype == 'float32':
            d = d2
        else:
            d = d3
        assert_allclose(d.astype(dtype), res[i])
def run_gradinput(self, inputs_shape, filters_shape,
                  subsample=(1, 1, 1)):
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))
    conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias,
                                           d=subsample, H=inputs)
    f_ref = theano.function([], conv)
    res_ref = f_ref()

    # Get bottom shape using convTransp3D
    bottom_shape = res_ref.shape
    bottom_val = numpy.random.random(bottom_shape).astype('float32')
    bottom = shared(bottom_val)

    weight = gpu_contiguous(filters.dimshuffle(0, 4, 1, 2, 3))
    top = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
    if subsample == (1, 1, 1):
        conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
            kern=weight, topgrad=top)
    else:
        conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
            kern=weight, topgrad=top, shape=bottom.shape[1:4])
    conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)

    f = theano.function([], conv_gemm, mode=mode_with_gpu)
    res = f()
    utt.assert_allclose(res_ref, res)
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of
    # integer or float input dtype.
    dtypes = ["uint8", "uint16", "uint32", "uint64",
              "int8", "int16", "int32", "int64",
              "float16", "float32", "float64"]

    for dtype_base in dtypes:
        for dtype_exp in dtypes:

            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = theano.tensor.vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp
            output = base ** exp
            f = theano.function([base], output, mode=mode_with_gpu)
            theano.printing.debugprint(f)

            # We don't transfer to the GPU when the output dtype is int*
            n = len([n for n in f.maker.fgraph.apply_nodes
                     if isinstance(n.op, GpuElemwise)])
            assert n == (output.dtype in tensor.float_dtypes)

            # Call the function to make sure the output is valid
            out = f(base_val)
            expected_out = base_val ** exp_val
            assert_allclose(out, expected_out)
def test_hgemm_swap():
    from theano.sandbox.cuda import nvcc_compiler
    if nvcc_compiler.nvcc_version < '7.5':
        raise SkipTest("SgemmEx is only available on cuda 7.5+")

    v = tensor.vector(dtype='float16')
    m = tensor.matrix(dtype='float16')
    m2 = tensor.matrix(dtype='float16')
    m32 = tensor.matrix(dtype='float32')

    # test that we don't try to replace anything but matrix x matrix
    # in float16
    f = theano.function([v, m], tensor.dot(v, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0

    f = theano.function([m32, m], tensor.dot(m32, m), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 0

    f = theano.function([m, m2], tensor.dot(m, m2), mode=mode_with_gpu)
    assert len([node for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuGemm)]) == 1

    v1 = numpy.random.random((3, 4)).astype('float16')
    v2 = numpy.random.random((4, 2)).astype('float16')

    of = f(v1, v2)
    on = numpy.dot(v1, v2)

    utt.assert_allclose(of, on)
def test_opt_conv3d_gemm(self):
    inputs_shape = (16, 20, 32, 16, 1)
    filters_shape = (10, 6, 12, 4, 1)

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')

    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[0]).astype('float32'))

    conv = theano.tensor.nnet.conv3D(V=inputs, W=filters,
                                     b=bias, d=(1, 1, 1))
    mode = mode_with_gpu.including('conv3d_gemm')
    mode.check_py_code = False

    f_ref = theano.function([], conv, mode="FAST_RUN")
    f_gemm = theano.function([], conv, mode=mode)

    # make sure we inserted the gemm trickery
    topo = f_gemm.maker.fgraph.toposort()
    assert sum(isinstance(n.op, GpuCorr3dMM) for n in topo) > 0

    res_ref = f_ref()
    res_gemm = f_gemm()
    utt.assert_allclose(res_ref, res_gemm)
def test_DownsampleFactorMax(self):
    rng = numpy.random.RandomState(utt.fetch_seed())
    # generate random images
    maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
    imval = rng.rand(4, 2, 16, 16)
    images = tensor.dtensor4()
    for maxpoolshp, ignore_border, mode in product(
            maxpoolshps,
            [True, False],
            ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
        # print 'maxpoolshp =', maxpoolshp
        # print 'ignore_border =', ignore_border

        # Pure Numpy computation
        numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
                                                  ignore_border,
                                                  mode=mode)
        output = max_pool_2d(images, maxpoolshp, ignore_border,
                             mode=mode)
        f = function([images, ], [output, ])
        output_val = f(imval)
        utt.assert_allclose(output_val, numpy_output_val)

        # DownsampleFactorMax op
        maxpool_op = DownsampleFactorMax(maxpoolshp,
                                         ignore_border=ignore_border,
                                         mode=mode)(images)
        f = function([images], maxpool_op)
        output_val = f(imval)
        utt.assert_allclose(output_val, numpy_output_val)
def test_dnn_conv_merge():
    if not cuda.dnn.dnn_available():
        raise SkipTest(cuda.dnn.dnn_available.msg)
    img = T.ftensor4()
    kern = T.ftensor4()
    out = T.ftensor4()

    b = 1
    c = 4
    f = 3
    ih = 5
    iw = 8
    kh = 2
    kw = 6
    img_val = numpy.random.random((b, c, ih, iw)).astype("float32")
    kern_val = numpy.random.random((f, c, kh, kw)).astype("float32")
    out_val = numpy.random.random((b, f, ih - kh + 1,
                                   iw - kw + 1)).astype("float32")

    conv = dnn.dnn_conv(img, kern)
    gw = theano.grad(conv.sum(), kern)
    gi = theano.grad(conv.sum(), img)

    lr = numpy.asarray(0.05, dtype="float32")

    if cuda.dnn.version() == -1:
        # Can't merge alpha with cudnn v1
        fr = conv + out
        wr = kern + gw
        ir = img + gi
    else:
        fr = lr * (conv + out)
        wr = kern + lr * gw
        ir = img + lr * gi

    f1 = theano.function([img, kern, out], [fr, wr, ir],
                         mode=mode_with_gpu)
    assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
                      dnn.GpuDnnConv)
    assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradW)
    assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
                      dnn.GpuDnnConvGradI)

    mode = mode_with_gpu
    mode = mode.excluding("local_dnn_conv_alpha_merge")
    mode = mode.excluding("local_dnn_convw_alpha_merge")
    mode = mode.excluding("local_dnn_convi_alpha_merge")
    mode = mode.excluding("local_dnn_conv_output_merge")
    mode = mode.excluding("local_dnn_convw_output_merge")
    mode = mode.excluding("local_dnn_convi_output_merge")

    f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode)

    assert not isinstance(
        f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op,
        dnn.GpuDnnConv)
    assert not isinstance(
        f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op,
        dnn.GpuDnnConvGradW)
    assert not isinstance(
        f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op,
        dnn.GpuDnnConvGradI)

    out_f1 = f1(img_val, kern_val, out_val)
    out_f2 = f2(img_val, kern_val, out_val)

    assert len(out_f1) == len(out_f2)

    for v1, v2 in zip(out_f1, out_f2):
        utt.assert_allclose(v1, v2)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape,
                   subsample=(1, 1, 1)):
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

    inputs = shared(inputs_val)
    dCdH = shared(dCdH_val)

    conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                         WShape=filters_shape,
                                         d=subsample)
    img = gpu_contiguous(inputs.dimshuffle(0, 4, 1, 2, 3))
    topgrad = gpu_contiguous(dCdH.dimshuffle(0, 4, 1, 2, 3))
    if subsample == (1, 1, 1):
        conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
            img, topgrad)
    else:
        conv_gemm = GpuCorr3dMM_gradWeights(subsample=subsample)(
            img, topgrad, shape=filters_shape[1:4])
    conv_gemm = conv_gemm.dimshuffle(0, 2, 3, 4, 1)

    f_ref = theano.function([], conv)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    res_ref = f_ref()
    res = f()
    utt.assert_allclose(res_ref, res)
def test_opt_convtransp3d_fft(self):
    inputs_shape = (2, 9, 16, 12, 10)
    filters_shape = (10, 3, 8, 4, 1)

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

    inputs = shared(inputs_val)
    filters = shared(filters_val)

    conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias,
                                           d=(1, 1, 1), H=inputs)
    mode = mode_with_gpu.including('convtransp3d_fft')

    f_ref = theano.function([], conv)
    f_fft = theano.function([], conv, mode=mode)

    # make sure we inserted the fft trickery
    topo = f_fft.maker.fgraph.toposort()
    assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
               for n in topo) == 2

    res_ref = f_ref()
    res_fft = f_fft()

    utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
def test_concatenate(self):
    def ref(*inputs):
        axis = inputs[0]
        tensors = inputs[1:]
        return numpy.concatenate(tensors, axis)

    seed = utt.fetch_seed()
    rng = numpy.random.RandomState(seed)
    imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
    n, c = 4, 2
    axis = 1

    image = T.dtensor4('image')
    image1 = T.dtensor4('image1')
    for imgsize in imgsize_list:
        imval = rng.rand(n, c, imgsize[0], imgsize[1])
        output_ref = ref(axis, imval, imval)

        Opout = self.mkl_concatenate_func(axis, image, image1)
        f = function([image, image1], [Opout, ])
        output_mkl = f(imval, imval)

        utt.assert_allclose(output_mkl, output_ref)
def test_relu_grad(self):
    seed = utt.fetch_seed()
    rng = numpy.random.RandomState(seed)
    imgsize_list = ((5, 5), (6, 6), (6, 6), (8, 8))
    n, c = 4, 2
    axis = 1

    image = T.dtensor4('image')
    image1 = T.dtensor4('image1')
    for imgsize in imgsize_list:
        imval = rng.rand(n, c, imgsize[0], imgsize[1])

        out = T.concatenate([image, image1], axis)
        sum_ref = T.sum(out)
        gx_ref = T.grad(sum_ref, [image, image1])
        f_ref = theano.function([image, image1], outputs=gx_ref,
                                mode=mode_without_mkl)
        output_ref = f_ref(imval, imval)

        out_mkl = self.mkl_concatenate_func(axis, image, image1)
        sum_mkl = T.sum(out_mkl)
        gx_mkl = T.grad(sum_mkl, [image, image1])
        f_mkl = theano.function([image, image1], outputs=gx_mkl)
        output_mkl = f_mkl(imval, imval)

        utt.assert_allclose(output_mkl, output_ref)
def test_small_star():
    from batman import _rsky
    m_star = 0.151
    r_star = 0.189
    period = 0.4626413
    t0 = 0.2
    b = 0.5
    ecc = 0.1
    omega = 0.1
    t = np.linspace(0, period, 500)

    orbit = KeplerianOrbit(
        r_star=r_star, m_star=m_star,
        period=period, t0=t0, b=b,
        ecc=ecc, omega=omega)
    a = orbit.a.eval()
    incl = orbit.incl.eval()

    r_batman = _rsky._rsky(t, t0, period, a, incl, ecc, omega, 1, 1)
    m = r_batman < 100.0
    assert m.sum() > 0

    func = theano.function([], orbit.get_relative_position(t))
    x, y, z = func()
    r = np.sqrt(x**2 + y**2)

    # Make sure that the in-transit impact parameter matches batman
    utt.assert_allclose(r_batman[m], r[m], atol=2e-5)
def test_sparseblockgemvF(self):
    """
    Test the Fortran order for W (which can happen in the grad for some
    graphs).
    """
    b = tensor.fmatrix()
    W = tensor.ftensor4()
    h = tensor.ftensor3()
    iIdx = tensor.imatrix()
    oIdx = tensor.imatrix()

    o = self.gemv_op(b.take(oIdx, axis=0),
                     tensor.DimShuffle((False, False, False, False),
                                       (0, 1, 3, 2))
                     (tensor.as_tensor_variable(W)),
                     h, iIdx, oIdx)

    f = theano.function([W, h, iIdx, b, oIdx], o, mode=self.mode)

    W_val, h_val, iIdx_val, b_val, oIdx_val = \
        BlockSparse_Gemv_and_Outer.gemv_data()

    th_out = f(numpy.swapaxes(W_val, 2, 3), h_val, iIdx_val, b_val,
               oIdx_val)
    ref_out = BlockSparse_Gemv_and_Outer.gemv_numpy(
        b_val.take(oIdx_val, axis=0), W_val, h_val, iIdx_val, oIdx_val)

    utt.assert_allclose(ref_out, th_out)
def cmp(n, m, f, f_gpu):
    data = numpy.arange(n * m, dtype='float32').reshape(n, m)
    gdata = numpy.asarray(data)[:, :, None, None]

    out = f(data)
    gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
    utt.assert_allclose(out, gout)
def cmp(n, m):
    data = numpy.random.uniform(1e-7, 1, (n, m)).astype(dtype=dtypeInput)
    b_data = numpy.random.uniform(1e-7, 1, (m,)).astype(dtype=dtypeBias)

    out = f(data, b_data)
    gout = f_gpu(data, b_data)
    utt.assert_allclose(out, gout)
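# Note (added for clarity, not part of the original test): this `cmp` is a
# nested helper, so `f`, `f_gpu`, `dtypeInput` and `dtypeBias` are closed
# over from the enclosing test. A hypothetical standalone equivalent would
# take them explicitly:
#
#     def cmp_standalone(n, m, f, f_gpu, dtypeInput, dtypeBias):
#         data = numpy.random.uniform(1e-7, 1, (n, m)).astype(dtypeInput)
#         b_data = numpy.random.uniform(1e-7, 1, (m,)).astype(dtypeBias)
#         utt.assert_allclose(f(data, b_data), f_gpu(data, b_data))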
def with_linker(self, linker, op, type, rand_val):
    for xsh, ysh in [((3, 5), (3, 5)),
                     ((3, 5), (1, 5)),
                     ((3, 5), (3, 1)),
                     ((1, 5), (5, 1)),
                     ((1, 1), (1, 1)),
                     ((self.openmp_minsize,), (self.openmp_minsize,)),
                     ((self.openmp_minsize_sqrt,
                       self.openmp_minsize_sqrt),
                      (self.openmp_minsize_sqrt,
                       self.openmp_minsize_sqrt)),
                     ((2, 3, 4, 5), (2, 3, 4, 5)),
                     ((2, 3, 4, 5), (1, 3, 1, 5)),
                     ((2, 3, 4, 5), (1, 1, 1, 1)),
                     ((), ())]:
        x = type('float64', [(entry == 1) for entry in xsh])('x')
        y = type('float64', [(entry == 1) for entry in ysh])('y')
        e = op(scalar.add)(x, y)
        f = copy(linker).accept(FunctionGraph([x, y],
                                              [e])).make_function()
        xv = rand_val(xsh)
        yv = rand_val(ysh)
        zv = xv + yv

        unittest_tools.assert_allclose(f(xv, yv), zv)

        # test Elemwise.infer_shape
        # (the Shape op doesn't implement c_code!)
        if isinstance(linker, gof.PerformLinker):
            x = type('float64', [(entry == 1) for entry in xsh])('x')
            y = type('float64', [(entry == 1) for entry in ysh])('y')
            e = op(scalar.add)(x, y)
            f = copy(linker).accept(FunctionGraph(
                [x, y], [e.shape])).make_function()
            assert tuple(f(xv, yv)) == tuple(zv.shape)
def test_opt_convgrad3d_fft(self):
    inputs_shape = (2, 17, 15, 16, 1)
    filters_shape = (10, 6, 7, 4, 1)
    dCdH_shape = (inputs_shape[0],
                  inputs_shape[1] - filters_shape[1] + 1,
                  inputs_shape[2] - filters_shape[2] + 1,
                  inputs_shape[3] - filters_shape[3] + 1,
                  filters_shape[0])

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

    inputs = shared(inputs_val)
    dCdH = shared(dCdH_val)

    conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                         WShape=filters_shape,
                                         d=(1, 1, 1))
    mode = mode_with_gpu.including('convgrad3d_fft')
    mode.check_py_code = False

    f_ref = theano.function([], conv, mode="FAST_RUN")
    f_fft = theano.function([], conv, mode=mode)

    # make sure we inserted the fft trickery
    topo = f_fft.maker.fgraph.toposort()
    assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
               for n in topo) == 2

    res_ref = f_ref()
    res_fft = f_fft()

    utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
def test3(self):
    a = tensor.dvector()
    w2 = sort(a)
    f = theano.function([a], w2)
    gv = f(self.v_val)
    gt = np.sort(self.v_val)
    utt.assert_allclose(gv, gt)
def test_opt_full(self):
    inputs_shape = (5, 3, 7, 6)
    filters_shape = (2, 3, 3, 3)

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')

    inputs = shared(inputs_val)
    filters = shared(filters_val)

    conv = theano.tensor.nnet.conv.conv2d(inputs, filters,
                                          border_mode='full')

    mode = mode_with_gpu.including('conv_fft_full')

    f_ref = theano.function([], conv)
    f_fft = theano.function([], conv, mode=mode)

    # make sure we inserted the fft trickery
    topo = f_fft.maker.fgraph.toposort()
    assert sum(isinstance(n.op, theano.sandbox.cuda.fftconv.CuFFTOp)
               for n in topo) == 2, topo

    res_ref = f_ref()
    res_fft = f_fft()

    utt.assert_allclose(res_ref, res_fft)
def test_None(self):
    a = tensor.dmatrix()
    l = sort(a, None)
    f = theano.function([a], l)
    gv = f(self.m_val)
    gt = np.sort(self.m_val, None)
    utt.assert_allclose(gv, gt)
def test_opt_convgrad3d_gemm(self):
    inputs_shape = (16, 10, 12, 16, 1)
    filters_shape = (10, 6, 12, 4, 1)
    dCdH_shape = (16, 5, 1, 13, 10)

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    dCdH_val = numpy.random.random(dCdH_shape).astype('float32')

    inputs = shared(inputs_val)
    dCdH = shared(dCdH_val)

    conv = theano.tensor.nnet.convGrad3D(V=inputs, dCdH=dCdH,
                                         WShape=filters_shape,
                                         d=(1, 1, 1))
    mode = mode_with_gpu.including('convgrad3d_gemm')

    f_ref = theano.function([], conv)
    f_gemm = theano.function([], conv, mode=mode)

    # make sure we inserted the gemm trickery
    topo = f_gemm.maker.fgraph.toposort()
    assert sum(isinstance(n.op, GpuCorr3dMM_gradWeights)
               for n in topo) > 0

    res_ref = f_ref()
    res_gemm = f_gemm()

    utt.assert_allclose(res_ref, res_gemm)
def test_1Drfft(self):
    inputs_val = np.random.random((1, N)).astype(theano.config.floatX)

    x = T.matrix('x')
    rfft = fft.rfft(x)
    f_rfft = theano.function([x], rfft)
    res_rfft = f_rfft(inputs_val)
    res_rfft_comp = (np.asarray(res_rfft[:, :, 0]) +
                     1j * np.asarray(res_rfft[:, :, 1]))

    rfft_ref = np.fft.rfft(inputs_val, axis=1)

    utt.assert_allclose(rfft_ref, res_rfft_comp)

    m = rfft.type()
    print(m.ndim)
    irfft = fft.irfft(m)
    f_irfft = theano.function([m], irfft)
    res_irfft = f_irfft(res_rfft)

    utt.assert_allclose(inputs_val, np.asarray(res_irfft))

    # The numerical gradient of the FFT is sensitive, must set large
    # enough epsilon to get good accuracy.
    eps = 1e-1

    def f_rfft(inp):
        return fft.rfft(inp)
    inputs_val = np.random.random((1, N)).astype(theano.config.floatX)
    utt.verify_grad(f_rfft, [inputs_val], eps=eps)

    def f_irfft(inp):
        return fft.irfft(inp)
    inputs_val = np.random.random((1, N // 2 + 1, 2)).astype(
        theano.config.floatX)
    utt.verify_grad(f_irfft, [inputs_val], eps=eps)
def test_DownsampleFactorMaxPaddingStride(self):
    ignore_border = True  # padding does not support ignore_border=False
    rng = numpy.random.RandomState(utt.fetch_seed())

    maxpoolsizes = [(3, 3), (4, 4), (3, 4), (4, 3), (2, 2)]
    stridesizes = [(2, 2), (2, 2), (1, 1), (1, 2), (2, 2)]
    paddingsizes = [(2, 2), (1, 2), (2, 1), (0, 0), (1, 1)]
    imgsizes = [(5, 5), (5, 5), (5, 6), (6, 5), (5, 5)]
    m = 4  # minibatch
    c = 2  # channel size
    images = tensor.dtensor4()

    for indx, mode in product(numpy.arange(len(maxpoolsizes)),
                              ['max', 'sum', 'average_inc_pad',
                               'average_exc_pad']):
        imgsize = imgsizes[indx]
        imval = rng.rand(m, c, imgsize[0], imgsize[1]) - 0.5

        stridesize = stridesizes[indx]
        maxpoolsize = maxpoolsizes[indx]
        paddingsize = paddingsizes[indx]

        numpy_output_val = self.numpy_max_pool_2d_stride_padding(
            imval, maxpoolsize, ignore_border, stridesize,
            paddingsize, mode)
        maxpool_op = DownsampleFactorMax(
            maxpoolsize,
            ignore_border=ignore_border,
            st=stridesize, padding=paddingsize,
            mode=mode)(images)
        f = function([images], maxpool_op)
        output_val = f(imval)
        utt.assert_allclose(output_val, numpy_output_val)
def test_irfft(self):
    inputs_val = np.random.random((1, N, N)).astype(theano.config.floatX)
    inputs = theano.shared(inputs_val)

    rfft = fft.rfft(inputs)
    f_rfft = theano.function([], rfft)
    res_fft = f_rfft()

    m = rfft.type()
    irfft = fft.irfft(m)
    f_irfft = theano.function([m], irfft)
    res_irfft = f_irfft(res_fft)

    utt.assert_allclose(inputs_val, np.asarray(res_irfft))

    inputs_val = np.random.random((1, N, N, 2)).astype(
        theano.config.floatX)
    inputs = theano.shared(inputs_val)

    irfft = fft.irfft(inputs)
    f_irfft = theano.function([], irfft)
    res_irfft = f_irfft()
    inputs_ref = inputs_val[..., 0] + inputs_val[..., 1] * 1j

    irfft_ref = np.fft.irfftn(inputs_ref, axes=(1, 2))

    utt.assert_allclose(irfft_ref, res_irfft, atol=1e-4, rtol=1e-4)
def cmp(a_shp, b_shp):
    a = numpy.random.randn(*a_shp).astype(numpy.float32)
    b = numpy.random.randn(*b_shp).astype(numpy.float32)

    x = tensor.ftensor3()
    y = tensor.ftensor3()

    f = theano.function([x, y], batched_dot(x, y), mode=mode_with_gpu)

    z0 = numpy.asarray(f(a, b))

    ga = cuda_ndarray.CudaNdarray(a)
    gb = cuda_ndarray.CudaNdarray(b)

    z1 = numpy.asarray(f(ga, gb))

    # reference batched dot computed via broadcasting
    z_test = numpy.sum(a[:, :, :, None] * b[:, None, :, :], axis=-2)

    unittest_tools.assert_allclose(z0, z_test)
    unittest_tools.assert_allclose(z1, z_test)
def test_opt_convtransp3d_gemm(self):
    inputs_shape = (16, 15, 12, 12, 10)
    filters_shape = (10, 6, 12, 4, 1)

    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')
    bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

    inputs = shared(inputs_val)
    filters = shared(filters_val)

    conv = theano.tensor.nnet.convTransp3D(W=filters, b=bias,
                                           d=(1, 1, 1), H=inputs)
    mode = mode_with_gpu.including('convtransp3d_gemm')

    f_ref = theano.function([], conv)
    f_gemm = theano.function([], conv, mode=mode)

    # make sure we inserted the gemm trickery
    topo = f_gemm.maker.fgraph.toposort()
    assert sum(isinstance(n.op, GpuCorr3dMM_gradInputs)
               for n in topo) > 0

    res_ref = f_ref()
    res_gemm = f_gemm()

    utt.assert_allclose(res_ref, res_gemm)
def run_conv_full(self, inputs_shape, filters_shape, pad=False):
    inputs_val = numpy.random.random(inputs_shape).astype('float32')
    filters_val = numpy.random.random(filters_shape).astype('float32')

    inputs = shared(inputs_val)
    filters = shared(filters_val)
    bias = shared(numpy.zeros(filters_shape[4]).astype('float32'))

    conv_ref = theano.tensor.nnet.convTransp3D(
        W=filters, b=bias, d=(1, 1, 1), H=inputs)

    filters = filters.dimshuffle(4, 0, 1, 2, 3)
    inputs = inputs.dimshuffle(0, 4, 1, 2, 3)
    conv_fft = theano.sandbox.cuda.fftconv.conv3d_fft(
        inputs, filters, border_mode="full", pad_last_dim=pad)
    conv_fft = conv_fft.dimshuffle(0, 2, 3, 4, 1)

    f_ref = theano.function([], conv_ref)
    f_fft = theano.function([], conv_fft, mode=mode_with_gpu)

    res_ref = f_ref()
    res_fft = f_fft()
    utt.assert_allclose(res_ref, res_fft, rtol=1e-04, atol=1e-04)
def test_GpuCumsum2D(self):
    block_max_size = self.max_threads_dim0 * 2

    x = T.fmatrix('x')
    for shape_axis, axis in zip([0, 1, 0, 1, 0], [0, 1, None, -1, -2]):
        f = theano.function([x], cumsum(x, axis=axis), mode=self.mode)
        assert [n for n in f.maker.fgraph.toposort()
                if isinstance(n.op, GpuCumsum)]

        # Extensive testing for the first 1025 sizes
        a_shape = [5, 5]
        a_shape[shape_axis] = 1025
        a = np.random.random(a_shape).astype("float32")
        slices = [slice(None), slice(None)]
        for i in xrange(a.shape[shape_axis]):
            slices[shape_axis] = slice(i)
            fa = f(a[slices])
            npa = np.cumsum(a[slices], axis=axis)
            utt.assert_allclose(npa, fa)

        # Use multiple GPU threadblocks
        a_shape = [5, 5]
        a_shape[shape_axis] = block_max_size + 2
        a = np.random.random(a_shape).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))

        # Use multiple GPU gridblocks
        a_shape = [4, 4]
        a_shape[1 - shape_axis] = self.max_grid_size1 + 1
        a = np.random.random(a_shape).astype("float32")
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a), rtol=5e-5)

        # Use recursive cumsum
        a_shape = [3, 3]
        a_shape[shape_axis] = block_max_size * (block_max_size + 1) + 2
        a = np.random.random(a_shape).astype("float32")
        a = np.sign(a - 0.5).astype("float32")  # Avoid floating point error
        utt.assert_allclose(np.cumsum(a, axis=axis), f(a))
def test_multinomial_input_dtype():
    # This tests the MultinomialFromUniform Op directly, not going through
    # the multinomial() call in GPU random generation.

    for idtype in ['float32', 'float16', 'float64']:
        for odtype in ['float32', 'float16', 'float64', 'int32']:

            p = tensor.matrix('p', idtype)
            u = tensor.vector('u', idtype)
            # p = tensor.dmatrix('p')
            # u = tensor.dvector('u')
            m = theano.sandbox.multinomial.MultinomialFromUniform(
                odtype)(p, u)

            # the m*2 allows the multinomial to reuse output
            f = function([p, u], m * 2, allow_input_downcast=True,
                         mode=mode_with_gpu)

            assert any([type(node.op) is GPUAMultinomialFromUniform
                        for node in f.maker.fgraph.toposort()])

            # test that both first and second samples can be drawn
            utt.assert_allclose(f([[1, 0], [0, 1]], [.1, .1]),
                                [[2, 0], [0, 2]])

            # test that both second labels can be drawn
            r = f([[.2, .8], [.3, .7]], [.31, .31])
            utt.assert_allclose(r, [[0, 2], [0, 2]])

            # test that both first labels can be drawn
            r = f([[.2, .8], [.3, .7]], [.21, .21])
            utt.assert_allclose(r, [[0, 2], [2, 0]])

            # change the size to make sure output gets reallocated ok
            # and also make sure that the GPU version doesn't screw up
            # the transposed-ness
            r = f([[.2, .8]], [.25])
            utt.assert_allclose(r, [[0, 2]])
def test_float16():
    # gemv (gemm called)
    float16_data = [rand(3).astype("float16"),
                    np.asarray(1, dtype=np.float32),
                    rand(3, 3).astype("float16"),
                    rand(3).astype("float16"),
                    np.asarray(0.5, dtype=np.float32)]
    float16_shared = [gpuarray_shared_constructor(val,
                                                  target=test_ctx_name)
                      for val in float16_data]
    o = gemv(*float16_shared)
    f = theano.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(n.op, GpuGemm) for n in topo])

    # gemm
    float16_data = [rand(3, 3).astype("float16"),
                    np.asarray(1, dtype=np.float32),
                    rand(3, 3).astype("float16"),
                    rand(3, 3).astype("float16"),
                    np.asarray(0.5, dtype=np.float32)]
    float16_shared = [gpuarray_shared_constructor(val,
                                                  target=test_ctx_name)
                      for val in float16_data]
    o = gpugemm_no_inplace(*float16_shared)
    f = theano.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"),
                    rand(3, 3).astype("float16")]
    float16_shared = [gpuarray_shared_constructor(val)
                      for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = theano.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
def test_boolean_mask():
    tensor = T.constant([0, 1, 2, 3], dtype=theano.config.floatX)
    mask = np.array([True, False, True, False])
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(), (0, 2))

    tensor = [[1, 2], [3, 4], [5, 6]]
    mask = np.array([True, False, True])
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(), [[1, 2], [5, 6]])

    tensor_np = np.random.rand(3, 4, 2).astype(theano.config.floatX)
    tensor = T.as_tensor(tensor_np)
    mask = T.all(tensor > .5, 2)
    masked = nn.utils.boolean_mask(tensor, mask)
    utt.assert_allclose(masked.eval(),
                        tensor_np[np.all(tensor_np > .5, 2)])
def test_GpuCumsum1D(self):
    block_max_size = self.max_threads_dim0 * 2

    x = T.fvector('x')
    f = theano.function([x], cumsum(x), mode=self.mode)
    assert [n for n in f.maker.fgraph.toposort()
            if isinstance(n.op, GpuCumsum)]

    # Extensive testing for the first 1025 sizes
    a = np.random.random(1025).astype("float32")
    for i in xrange(a.shape[0]):
        utt.assert_allclose(np.cumsum(a[:i]), f(a[:i]))

    # Use multiple GPU threadblocks
    a = np.random.random((block_max_size + 2,)).astype("float32")
    utt.assert_allclose(np.cumsum(a), f(a))

    # Use recursive cumsum
    a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                dtype="float32")
    utt.assert_allclose(np.cumsum(a), f(a))
def test_velocity():
    t_tensor = tt.dvector()
    t = np.linspace(0, 100, 1000)
    m_planet = 0.1
    m_star = 1.3
    orbit = KeplerianOrbit(
        m_star=m_star,
        r_star=1.0,
        t0=0.5,
        period=100.0,
        ecc=0.1,
        omega=0.5,
        Omega=1.0,
        incl=0.25 * np.pi,
        m_planet=m_planet,
    )

    star_pos = orbit.get_star_position(t_tensor)
    star_vel = theano.function([], orbit.get_star_velocity(t))()
    star_vel_expect = np.empty_like(star_vel)
    for i in range(3):
        g = theano.grad(tt.sum(star_pos[i]), t_tensor)
        star_vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(star_vel, star_vel_expect)

    planet_pos = orbit.get_planet_position(t_tensor)
    planet_vel = theano.function([], orbit.get_planet_velocity(t))()
    planet_vel_expect = np.empty_like(planet_vel)
    for i in range(3):
        g = theano.grad(tt.sum(planet_pos[i]), t_tensor)
        planet_vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(planet_vel, planet_vel_expect)

    pos = orbit.get_relative_position(t_tensor)
    vel = np.array(theano.function([], orbit.get_relative_velocity(t))())
    vel_expect = np.empty_like(vel)
    for i in range(3):
        g = theano.grad(tt.sum(pos[i]), t_tensor)
        vel_expect[i] = theano.function([t_tensor], g)(t)
    utt.assert_allclose(vel, vel_expect)
def test_DownsampleFactorMax(self):
    rng = numpy.random.RandomState(utt.fetch_seed())
    # generate random images
    maxpoolshps = ((1, 1), (2, 2), (3, 3), (2, 3))
    imval = rng.rand(4, 2, 16, 16)
    images = tensor.dtensor4()

    for maxpoolshp, ignore_border, mode in product(
            maxpoolshps,
            [True, False],
            ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
        # print 'maxpoolshp =', maxpoolshp
        # print 'ignore_border =', ignore_border

        # Pure Numpy computation
        numpy_output_val = self.numpy_max_pool_2d(imval, maxpoolshp,
                                                  ignore_border,
                                                  mode=mode)
        output = pool_2d(images, maxpoolshp, ignore_border,
                         mode=mode)
        f = function([images, ], [output, ])
        output_val = f(imval)
        utt.assert_allclose(output_val, numpy_output_val)

        # Pool op
        maxpool_op = Pool(maxpoolshp,
                          ignore_border=ignore_border,
                          mode=mode)(images)

        output_shape = Pool.out_shape(imval.shape, maxpoolshp,
                                      ignore_border=ignore_border)
        utt.assert_allclose(numpy.asarray(output_shape),
                            numpy_output_val.shape)
        f = function([images], maxpool_op)
        output_val = f(imval)
        utt.assert_allclose(output_val, numpy_output_val)
def test_GpuCumOp1D(self, mode):
    np_func = dict(add=np.cumsum, mul=np.cumprod)[mode]
    op_class = partial(self.op_class, mode=mode)
    block_max_size = self.max_threads_dim0 * 2

    x = T.fvector('x')
    f = theano.function([x], op_class(axis=0)(x), mode=self.mode)
    assert [n for n in f.maker.fgraph.toposort()
            if isinstance(n.op, GpuCumOp)]

    # Extensive testing for the first 1025 sizes
    a = np.random.random(1025).astype("float32")
    for i in xrange(a.shape[0]):
        utt.assert_allclose(np_func(a[:i]), f(a[:i]))

    # Use multiple GPU threadblocks
    a = np.random.random((block_max_size + 2,)).astype("float32")
    utt.assert_allclose(np_func(a), f(a))

    # Use recursive cumop
    a = np.ones((block_max_size * (block_max_size + 1) + 2,),
                dtype="float32")
    utt.assert_allclose(np_func(a), f(a))
def test_los(self):
    f, _, in_args = self.get_args()
    in_args[-1] = np.ones_like(in_args[-1])
    out = f(*in_args)
    utt.assert_allclose(0.0, out)
def test_basic(self):
    f, _, in_args = self.get_args()
    out = f(*in_args)
    utt.assert_allclose(0.0, out[0])
    utt.assert_allclose(0.0, out[-1])
def _params_allgood(ishape, kshape, mode, subsample=(1, 1),
                    img_stride=(1, 1), kern_stride=(1, 1), version=-1,
                    verbose=0, random=True, print_=None, id=None,
                    rtol=1e-5, atol=1e-8, nb_iter=0, ones=False,
                    compile_kshp=None, theano_mode=None, cls=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this
    # directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) hardcode the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape), dtype='float32') + 1
        npy_kern = -(theano._asarray(
            numpy.arange(numpy.prod(kshape)).reshape(kshape),
            dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # we take the stride after the transfer as we make c_contiguous
    # data on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()
    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                          subsample=subsample,
                                          version=version,
                                          verbose=verbose,
                                          kshp=compile_kshp)(i, k)
    f = theano.function([i, k], op, mode=theano_mode)
    if cls is not None:
        assert any([isinstance(node.op, cls)
                    for node in f.maker.fgraph.toposort()]), \
            "Cannot find class %r in %r" % (cls,
                                            f.maker.fgraph.toposort())
    t2 = time.time()
    gpuval = f(img, kern)
    t3 = time.time()
    for i in range(nb_iter):
        gpuval2 = f(img, kern)
        assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
    gpuval = numpy.asarray(gpuval)

    # CPU val computed after GPU val to get the GPU errors.
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()

    assert gpuval.shape == cpuval.shape, ("shape mismatch",
                                          gpuval.shape, cpuval.shape)
    assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
    assert numpy.all(numpy.isfinite(gpuval)), gpuval
    assert [(sh == 1) is br
            for sh, br in zip(cpuval.shape[:2],
                              op.type.broadcastable[:2])]

    if t2 is not None:
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t3 - t2)
        if verbose > 0:
            print('%15s' % str(ishape), '%15s' % str(kshape), end=' ',
                  file=sys.stdout)
            print('%12.5f  %7.2f %7.2f %7.1f' % (
                approx_fp, cpu_mflops, gpu_mflops,
                (t1 - t0) / (t2 - t1)), file=sys.stdout)
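# A minimal direct-use sketch of _params_allgood (hypothetical shapes, not
# from the original suite; leaves theano_mode at its default): checking one
# specific valid-mode convolution with a strided image.
def test_params_allgood_smoke():
    _params_allgood(ishape=(2, 3, 8, 8),
                    kshape=(4, 3, 3, 3),
                    mode='valid',
                    subsample=(1, 1),
                    img_stride=(2, 2))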
def test_DownsampleFactorMax(self):
    rng = numpy.random.RandomState(utt.fetch_seed())
    # maxpool, input size
    examples = (
        ((2,), (16,)),
        ((2,), (4, 16,)),
        ((2,), (4, 2, 16,)),
        ((1, 1), (4, 2, 16, 16)),
        ((2, 2), (4, 2, 16, 16)),
        ((3, 3), (4, 2, 16, 16)),
        ((3, 2), (4, 2, 16, 16)),
        ((3, 2, 2), (3, 2, 16, 16, 16)),
        ((2, 3, 2), (3, 2, 16, 16, 16)),
        ((2, 2, 3), (3, 2, 16, 16, 16)),
        ((2, 2, 3, 2), (3, 2, 6, 6, 6, 5)),
    )

    for example, ignore_border, mode in product(
            examples,
            [True, False],
            ['max', 'sum', 'average_inc_pad', 'average_exc_pad']):
        (maxpoolshp, inputsize) = example
        imval = rng.rand(*inputsize)
        images = theano.shared(imval)

        # Pure Numpy computation
        numpy_output_val = self.numpy_max_pool_nd(imval, maxpoolshp,
                                                  ignore_border,
                                                  mode=mode)

        # The pool_2d or pool_3d helper methods
        if len(maxpoolshp) == 2:
            output = pool_2d(images, maxpoolshp, ignore_border,
                             mode=mode)
            f = function([], [output, ])
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)
        elif len(maxpoolshp) == 3:
            output = pool_3d(images, maxpoolshp, ignore_border,
                             mode=mode)
            f = function([], [output, ])
            output_val = f()
            utt.assert_allclose(output_val, numpy_output_val)

        # Pool op
        maxpool_op = Pool(ndim=len(maxpoolshp),
                          ignore_border=ignore_border,
                          mode=mode)(images, maxpoolshp)

        output_shape = Pool.out_shape(imval.shape, maxpoolshp,
                                      ndim=len(maxpoolshp),
                                      ignore_border=ignore_border)
        utt.assert_allclose(numpy.asarray(output_shape),
                            numpy_output_val.shape)
        f = function([], maxpool_op)
        output_val = f()
        utt.assert_allclose(output_val, numpy_output_val)
def test_softmax_grad(self):
    def cmp(n, m, f, f_gpu):
        data = numpy.arange(n * m, dtype='float32').reshape(n, m)
        gdata = numpy.asarray(data)[:, :, None, None]

        out = f(data)
        gout = numpy.asarray(f_gpu(gdata))[:, :, 0, 0]
        utt.assert_allclose(out, gout)

    x = T.matrix('x', 'float32')
    x_gpu = T.tensor4('x_gpu', 'float32')
    f_z = T.nnet.softmax_op
    f_gpu = dnn.GpuDnnSoftmax('accurate', 'channel')

    # Verify the grad operation
    dims = (2, 3, 4, 5)
    gdata = numpy.arange(numpy.product(dims),
                         dtype='float32').reshape(dims)
    T.verify_grad(f_gpu, [gdata], rng=numpy.random,
                  mode=mode_with_gpu)

    # Verify that the CPU and GPU implementations return the same
    # results up to a tolerance.
    self._test_softmax(x, x_gpu, f_z, f_gpu, cmp)
    self._test_softmax(x, x, f_z, f_z, self._cmp)

    # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
    # optimization is applied when cudnn is required
    y = T.fvector('y')
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    val = numpy.random.rand(5).astype('float32')
    out_dnn = f(val)
    assert len([i for i in sorted_f
                if isinstance(i.op, self.gpu_grad_op)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op,
                              theano.tensor.nnet.SoftmaxGrad)]) == 0

    # Verify that the SoftmaxGrad -> Gpu[Dnn]SoftmaxGrad
    # optimization is not applied when cudnn is excluded or not
    # available
    mode_wo_cudnn = mode_with_gpu.excluding("cudnn")
    y = T.fvector('y')
    f = theano.function([y], T.grad(T.nnet.softmax(y).mean(), y),
                        mode=mode_wo_cudnn)
    sorted_f = f.maker.fgraph.toposort()
    out_cpu = f(val)
    utt.assert_allclose(out_dnn, out_cpu)
    assert len([i for i in sorted_f
                if isinstance(i.op, self.gpu_grad_op)]) == 0
    assert len([i for i in sorted_f
                if isinstance(i.op,
                              theano.tensor.nnet.SoftmaxGrad)]) == 1

    # Verify that the SoftmaxGrad -> GpuDnnSoftmaxGrad optimization
    # does not crash with a manual graph
    y = T.fvector('y')
    o = theano.tensor.nnet.SoftmaxGrad()(y, y * 2)
    f = theano.function([y], o, mode=mode_with_gpu)
    sorted_f = f.maker.fgraph.toposort()
    assert len([i for i in sorted_f
                if isinstance(i.op, self.gpu_grad_op)]) == 1
    assert len([i for i in sorted_f
                if isinstance(i.op,
                              theano.tensor.nnet.SoftmaxGrad)]) == 0
def test_pooling():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    # 'average_exc_pad' is disabled for versions < 4004
    if dnn.version(raises=False) < 4004:
        modes = ('max', 'average_inc_pad')
    else:
        modes = ('max', 'average_inc_pad', 'average_exc_pad')

    x = T.ftensor4()
    for mode, pad in product(modes,
                             ((0, 0), (1, 0), (1, 0), (2, 3), (3, 2))):
        if mode == 'max':
            func = T.max
        else:
            func = T.mean
        if pad != (0, 0) and func is T.mean:
            continue

        for ws in (4, 2, 5):
            for stride in (2, 3):
                if stride > ws:
                    continue
                if pad[0] > stride or pad[1] > stride:
                    # Not implemented
                    continue
                # We will check that the opt introduced it.
                out1 = pool_2d(x, (ws, ws),
                               st=(stride, stride),
                               ignore_border=True,
                               padding=pad, mode=mode)
                out2 = pool_2d_i2n(x, ds=(ws, ws),
                                   strides=(stride, stride),
                                   pad=pad,
                                   pool_function=func)
                mode_without_gpu2 = mode_without_gpu.including()
                mode_without_gpu2.check_isfinite = False

                f1 = theano.function([x], out1, mode=mode_with_gpu)
                assert any([isinstance(node.op, dnn.GpuDnnPool)
                            for node in f1.maker.fgraph.apply_nodes])
                f2 = theano.function([x], out2, mode=mode_without_gpu2)
                assert not any([isinstance(node.op, dnn.GpuDnnPool)
                                for node in f2.maker.fgraph.apply_nodes])
                for shp in [(1, 10, 100, 100),
                            (1, 3, 99, 99),
                            (32, 1, 147, 197)]:
                    data = numpy.random.normal(0, 1,
                                               shp).astype("float32")
                    a = f1(data)
                    b = f2(data)

                    utt.assert_allclose(a, b)

        # Test the grad
        for shp in [(1, 1, 2, 2),
                    (1, 1, 3, 3)]:
            data = numpy.random.normal(0, 1, shp).astype("float32") * 10

            ws = 2
            stride = 2
            if pad[0] > stride or pad[1] > stride:
                # Not implemented
                continue

            # This tests the CPU grad + opt + GPU implementation
            def fn(x):
                return pool_2d(x, (ws, ws), ignore_border=True,
                               padding=pad, mode=mode)
            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that the opt would have inserted it.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
                        for node in fg.maker.fgraph.toposort()])

            # Test the GPU grad + GPU implementation
            def fn(x):
                dnn_op = dnn.dnn_pool(
                    x, ws=(ws, ws),
                    stride=(stride, stride),
                    pad=pad,
                    mode=mode)
                return dnn_op
            utt.verify_grad(fn, [data],
                            cast_to_output_type=False,
                            mode=mode_with_gpu)
            # Confirm that we get the good op.
            fg = theano.function([x], theano.grad(fn(x).sum(), x),
                                 mode=mode_with_gpu)
            assert any([isinstance(node.op, dnn.GpuDnnPoolGrad)
                        for node in fg.maker.fgraph.toposort()])
            g_out = fg(data)

            # Compare against the CPU result
            out = pool_2d(x, (ws, ws),
                          padding=pad,
                          ignore_border=True,
                          mode=mode)
            fc = theano.function([x], theano.grad(out.sum(), x),
                                 mode=mode_without_gpu)
            if mode == 'max':
                assert any([isinstance(node.op, MaxPoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            else:
                assert any([isinstance(node.op, AveragePoolGrad)
                            for node in fc.maker.fgraph.toposort()])
            c_out = fc(data)
            utt.assert_allclose(c_out, g_out)
def test_local_gpu_elemwise_0():
    """
    Test local_gpu_elemwise_0 when there is a dtype upcastable to float32
    """
    a = tensor.bmatrix()
    b = tensor.fmatrix()
    c = tensor.fmatrix()

    a_v = (numpy.random.rand(4, 5) * 10).astype("int8")
    b_v = (numpy.random.rand(4, 5) * 10).astype("float32")
    c_v = (numpy.random.rand(4, 5) * 10).astype("float32")

    # Due to optimization order, this composite is created when all
    # the ops are on the gpu.
    f = theano.function([a, b, c], a + b + c, mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise)
               for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise)
               for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Now test with the composite already on the cpu before we move it
    # to the gpu
    a_s = theano.scalar.int8()
    b_s = theano.scalar.float32()
    c_s = theano.scalar.float32()
    out_s = theano.scalar.Composite([a_s, b_s, c_s], [a_s + b_s + c_s])
    out_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], out_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise)
               for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise)
               for node in topo) == 1
    utt.assert_allclose(f(a_v, b_v, c_v), a_v + b_v + c_v)

    # Test multiple output
    a_s = theano.scalar.float32()
    a = tensor.fmatrix()
    from theano.scalar.basic import identity
    out_s = theano.scalar.Composite(
        [a_s, b_s, c_s],
        [identity(a_s), identity(c_s), identity(b_s)])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise)
               for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise)
               for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v)
    utt.assert_allclose(out[1], c_v)
    utt.assert_allclose(out[2], b_v)

    # Test multiple output
    out_s = theano.scalar.Composite([a_s, b_s, c_s],
                                    [a_s + b_s, a_s * c_s])
    outs_op = tensor.Elemwise(out_s)
    f = theano.function([a, b, c], outs_op(a, b, c), mode=mode_with_gpu)
    topo = f.maker.fgraph.toposort()
    assert sum(isinstance(node.op, cuda.GpuElemwise)
               for node in topo) == 1
    assert sum(isinstance(node.op, tensor.Elemwise)
               for node in topo) == 0
    out = f(a_v, b_v, c_v)
    utt.assert_allclose(out[0], a_v + b_v)
    utt.assert_allclose(out[1], a_v * c_v)

    # Test non-contiguous input
    c = cuda.shared_constructor(c_v)
    f = theano.function([a, b], outs_op(a[::2], b[::2], c[::2]),
                        mode=mode_with_gpu)
    out = f(a_v, b_v)
    utt.assert_allclose(out[0], a_v[::2] + b_v[::2])
    utt.assert_allclose(out[1], a_v[::2] * c_v[::2])
def test_conv3d(mode=mode_without_gpu, shared=theano.tensor._shared):
    if ndimage is None:
        raise SkipTest("conv3d2d tests need SciPy")

    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 5, 3, 5, 5

    signals = numpy.arange(Ns * Ts * C * Hs * Ws).reshape(
        Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.arange(Nf * Tf * C * Hf * Wf).reshape(
        Nf, Tf, C, Hf, Wf).astype('float32')

    t0 = time.time()
    pyres = pyconv3d(signals, filters)
    print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(s_signals, s_filters,
                 signals_shape=signals.shape,
                 filters_shape=filters.shape)

    newconv3d = theano.function([], [],
                                updates={s_output: out},
                                mode=mode)

    check_diagonal_subtensor_view_traces(newconv3d)
    t0 = time.time()
    newconv3d()
    print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
                                 mode=mode,
                                 name='grad')
    check_diagonal_subtensor_view_traces(gnewconv3d)

    t0 = time.time()
    gnewconv3d()
    print('grad', time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 2, 3, 2, 2

    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(conv3d, [signals, filters], eps=1e-1, mode=mode)

    # Additional test that covers the case of the patched implementation
    # for filters with Tf=1
    Ns, Ts, C, Hs, Ws = 3, 10, 3, 32, 32
    Nf, Tf, C, Hf, Wf = 32, 1, 3, 5, 5

    signals = numpy.arange(Ns * Ts * C * Hs * Ws).reshape(
        Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.arange(Nf * Tf * C * Hf * Wf).reshape(
        Nf, Tf, C, Hf, Wf).astype('float32')

    t0 = time.time()
    pyres = pyconv3d(signals, filters)
    print(time.time() - t0)

    s_signals = shared(signals)
    s_filters = shared(filters)
    s_output = shared(signals * 0)

    out = conv3d(s_signals, s_filters,
                 signals_shape=signals.shape,
                 filters_shape=filters.shape)

    newconv3d = theano.function([], [],
                                updates={s_output: out},
                                mode=mode)

    t0 = time.time()
    newconv3d()
    print(time.time() - t0)
    utt.assert_allclose(pyres, s_output.get_value(borrow=True))
    gsignals, gfilters = theano.grad(out.sum(), [s_signals, s_filters])
    gnewconv3d = theano.function([], [],
                                 updates=[(s_filters, gfilters),
                                          (s_signals, gsignals)],
                                 mode=mode,
                                 name='grad')

    t0 = time.time()
    gnewconv3d()
    print('grad', time.time() - t0)

    Ns, Ts, C, Hs, Ws = 3, 3, 3, 5, 5
    Nf, Tf, C, Hf, Wf = 4, 1, 3, 2, 2

    signals = numpy.random.rand(Ns, Ts, C, Hs, Ws).astype('float32')
    filters = numpy.random.rand(Nf, Tf, C, Hf, Wf).astype('float32')
    utt.verify_grad(conv3d, [signals, filters], eps=1e-1, mode=mode)
def test_batch_normalization_test():
    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, mean, var = (vartype(n)
                                         for n in ("x", "scale", "bias",
                                                   "mean", "var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            out = bn.batch_normalization_test(x, scale, bias, mean,
                                              var, axes, eps)
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            scale2, bias2, mean2, var2 = (T.addbroadcast(t, *axes2)
                                          for t in (scale, bias,
                                                    mean, var))
            out2 = (x - mean2) * (scale2 / T.sqrt(var2 + eps)) + bias2
            # backward pass
            dy = vartype("dy")
            grads = T.grad(None, wrt=[x, scale, bias, mean, var],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias, mean, var],
                            known_grads={out2: dy})
            # compile
            f = theano.function([x, scale, bias, mean, var, dy],
                                [out, out2] + grads + grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op,
                                       (bn.AbstractBatchNormTrain,
                                        bn.AbstractBatchNormInference,
                                        bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((10, 20, 30, 40, 10), (4, 3, 1, 1, 1),
                               (1, 1, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Mean = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Var = np.random.rand(*param_shape).astype(
                    theano.config.floatX)
                outputs = f(X, Scale, Bias, Mean, Var, Dy)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[1])  # out
                # compare gradients
                utt.assert_allclose(outputs[2], outputs[2 + 5],
                                    atol=4e-5)  # dx
                utt.assert_allclose(outputs[3], outputs[3 + 5],
                                    atol=4e-5)  # dscale
                utt.assert_allclose(outputs[4], outputs[4 + 5])  # dbias
                utt.assert_allclose(outputs[5], outputs[5 + 5])  # dmean
                utt.assert_allclose(outputs[6], outputs[6 + 5],
                                    rtol=2e-3, atol=4e-5)  # dvar
def test_batch_normalization():
    def bn_ref(x, G, B, M, V):
        n = (x - M) / V
        return n * G + B

    np.random.seed(1234)
    X = 1 + np.random.random([10, 20]).astype("float32")
    B = 1 + np.random.random([20]).astype("float32")
    G = 1 + np.random.random([20]).astype("float32")
    M = 1 + np.random.random([20]).astype("float32")
    V = 1 + np.random.random([20]).astype("float32")

    x = theano.tensor.matrix("x")
    b = theano.tensor.vector("b")
    g = theano.tensor.vector("g")
    m = theano.tensor.vector("m")
    v = theano.tensor.vector("v")

    bn_ref_op = bn_ref(x, g, b, m, v)
    f_ref = theano.function([x, g, b, m, v], [bn_ref_op])
    res_ref = f_ref(X, G, B, M, V)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(x, g, b, m, v, mode=mode)
        f = theano.function([x, g, b, m, v], [bn_op])
        res = f(X, G, B, M, V)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta,
                                          mean, std, mode=mode)
        utt.verify_grad(bn_f, [X, G, B, M, V])

    bn_ref_op = bn_ref(x, g, b,
                       x.mean(axis=0, keepdims=True),
                       x.std(axis=0, keepdims=True))
    f_ref = theano.function([x, b, g], [bn_ref_op])
    res_ref = f_ref(X, G, B)
    for mode in ["low_mem", "high_mem"]:
        bn_op = bn.batch_normalization(
            x, g, b,
            x.mean(axis=0, keepdims=True),
            x.std(axis=0, keepdims=True),
            mode=mode,
        )
        f = theano.function([x, b, g], [bn_op])
        res = f(X, G, B)
        utt.assert_allclose(res_ref, res)

        def bn_f(inputs, gamma, beta, mean, std):
            return bn.batch_normalization(inputs, gamma, beta,
                                          mean, std, mode=mode)
        utt.verify_grad(
            bn_f,
            [X, G, B,
             X.mean(axis=0)[np.newaxis],
             X.std(axis=0)[np.newaxis]])
def test_batch_normalization_train():
    utt.seed_rng()

    for axes in ("per-activation", "spatial", (1, 2, 3, 4)):
        for vartype in (T.tensor5, T.tensor3, T.vector):
            x, scale, bias, running_mean, running_var = (
                vartype(n)
                for n in ("x", "scale", "bias", "running_mean",
                          "running_var"))
            ndim = x.ndim
            eps = 5e-3  # some non-standard value to test if it's used
            running_average_factor = 0.3

            # remove non-existing axes
            if isinstance(axes, tuple):
                axes = tuple(i for i in axes if i < ndim)
            if len(axes) == 0:
                continue

            # forward pass
            (out, x_mean, x_invstd,
             out_running_mean, out_running_var) = \
                bn.batch_normalization_train(
                    x, scale, bias, axes, eps,
                    running_average_factor, running_mean, running_var)
            # reference forward pass
            if axes == "per-activation":
                axes2 = (0,)
            elif axes == "spatial":
                axes2 = (0,) + tuple(range(2, ndim))
            else:
                axes2 = axes
            x_mean2 = x.mean(axis=axes2, keepdims=True)
            x_var2 = x.var(axis=axes2, keepdims=True)
            x_invstd2 = T.inv(T.sqrt(x_var2 + eps))
            scale2 = T.addbroadcast(scale, *axes2)
            bias2 = T.addbroadcast(bias, *axes2)
            out2 = (x - x_mean2) * (scale2 * x_invstd2) + bias2
            m = T.cast(T.prod(x.shape) / T.prod(scale.shape),
                       theano.config.floatX)
            out_running_mean2 = (running_mean *
                                 (1 - running_average_factor) +
                                 x_mean2 * running_average_factor)
            out_running_var2 = (running_var *
                                (1 - running_average_factor) +
                                (m / (m - 1)) * x_var2 *
                                running_average_factor)
            # backward pass
            dy = vartype("dy")
            grads = T.grad(None, wrt=[x, scale, bias],
                           known_grads={out: dy})
            # reference backward pass
            grads2 = T.grad(None, wrt=[x, scale, bias],
                            known_grads={out2: dy})
            # second-order backward pass
            dx = vartype("dinputs")
            dscale = vartype("dscale")
            dbias = vartype("dbias")
            grad_grads = T.grad(
                None, wrt=[x, dy, scale],
                known_grads=OrderedDict({grads[0]: dx,
                                         grads[1]: dscale,
                                         grads[2]: dbias}),
                consider_constant=[x, dy, scale, bias, x_mean,
                                   x_invstd, running_mean,
                                   running_var],
                return_disconnected="zero")
            # reference second-order backward pass
            grad_grads2 = T.grad(
                None, wrt=[x, dy, scale],
                known_grads=OrderedDict({grads2[0]: dx,
                                         grads2[1]: dscale,
                                         grads2[2]: dbias}),
                consider_constant=[x, dy, scale, bias, x_mean2,
                                   x_var2, running_mean, running_var],
                return_disconnected="zero")
            # compile
            f = theano.function(
                [x, scale, bias, running_mean, running_var,
                 dy, dx, dscale, dbias],
                [out, x_mean, x_invstd,
                 out_running_mean, out_running_var,
                 out2, x_mean2, x_invstd2,
                 out_running_mean2, out_running_var2] +
                grads + grads2 + grad_grads + grad_grads2)
            # check if the abstract Ops have been replaced
            assert not any([isinstance(n.op,
                                       (bn.AbstractBatchNormTrain,
                                        bn.AbstractBatchNormInference,
                                        bn.AbstractBatchNormTrainGrad))
                            for n in f.maker.fgraph.toposort()])
            # run
            for data_shape in ((5, 10, 30, 40, 10), (4, 3, 1, 1, 1),
                               (2, 3, 5, 5, 5)):
                data_shape = data_shape[:ndim]
                param_shape = tuple(1 if d in axes2 else s
                                    for d, s in enumerate(data_shape))
                X = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dy = -1 + 2 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Scale = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Bias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_mean = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Running_var = np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dx = 4 + 3 * np.random.randn(*data_shape).astype(
                    theano.config.floatX)
                Dscale = -1 + 2 * np.random.randn(*param_shape).astype(
                    theano.config.floatX)
                Dbias = np.random.randn(*param_shape).astype(
                    theano.config.floatX)

                outputs = f(X, Scale, Bias, Running_mean, Running_var,
                            Dy, Dx, Dscale, Dbias)
                # compare outputs
                utt.assert_allclose(outputs[0], outputs[0 + 5])  # out
                utt.assert_allclose(outputs[1], outputs[1 + 5])  # mean
                utt.assert_allclose(outputs[2],
                                    outputs[2 + 5])  # invstd
                utt.assert_allclose(outputs[3],
                                    outputs[3 + 5])  # running_mean
                utt.assert_allclose(
                    np.nan_to_num(outputs[4]),
                    np.nan_to_num(outputs[4 + 5]))  # running_var
                # compare gradients
                utt.assert_allclose(outputs[10], outputs[10 + 3],
                                    atol=1e-4)  # dx
                utt.assert_allclose(outputs[11], outputs[11 + 3],
                                    rtol=2e-4, atol=1e-4)  # dscale
                utt.assert_allclose(outputs[12],
                                    outputs[12 + 3])  # dbias
                # compare second-order gradients
                utt.assert_allclose(outputs[16], outputs[16 + 3],
                                    atol=1e-4)  # ddx
                utt.assert_allclose(outputs[17],
                                    outputs[17 + 3])  # ddy
                utt.assert_allclose(outputs[18], outputs[18 + 3],
                                    rtol=3e-4, atol=1e-4)  # ddscale
def test_conv_nnet1(): utt.seed_rng() rval_cpu = run_conv_nnet1(False) utt.seed_rng() rval_gpu = run_conv_nnet1(True) utt.assert_allclose(rval_cpu, rval_gpu, rtol=1e-4, atol=1e-6)
def cmp_run_conv_nnet2_classif(seed, isize, ksize, bsize,
                               ignore_error=False, n_train=10,
                               gpu_only=False, cpu_only=False,
                               float_atol=1e-06, check_isfinite=True,
                               pickle=False, verbose=0, version=-1):
    """Run the nnet2 function on 1 or 2 devices and compare the results.

       float_atol: None means use the default value.
       check_isfinite: DebugMode option, forwarded to DebugMode. For some
                       parameters, the CrossentropyCategorical1Hot op
                       generates inf when not optimized.
    """
    if config.mode == 'DEBUG_MODE':
        n_train = 1

    # Change global tolerance, used in DebugMode for instance
    orig_float32_atol = theano.tensor.basic.float32_atol

    try:
        if float_atol:
            theano.tensor.basic.float32_atol = float_atol

        if gpu_only and cpu_only:
            raise ValueError("Please use only one of cpu_only and gpu_only")
        elif cpu_only:
            use_gpu = False
            compare = False
        elif gpu_only:
            use_gpu = True
            compare = False
        else:
            compare = True

        if not compare:
            return run_conv_nnet2_classif(
                use_gpu=use_gpu, seed=seed, isize=isize, ksize=ksize,
                bsize=bsize, n_train=n_train,
                check_isfinite=check_isfinite,
                pickle=pickle,
                verbose=verbose, version=version)

        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_cpu, params_cpu, x_shape, y_shape, mode_cpu = \
            build_conv_nnet2_classif(
                use_gpu=False,
                isize=isize, ksize=ksize, n_batch=bsize,
                verbose=verbose, version=version,
                check_isfinite=check_isfinite)

        utt.seed_rng(seed)  # Seeds numpy.random with seed
        train_gpu, params_gpu, x_shape_gpu, y_shape_gpu, mode_gpu = \
            build_conv_nnet2_classif(
                use_gpu=True,
                isize=isize, ksize=ksize, n_batch=bsize,
                verbose=verbose, version=version,
                check_isfinite=check_isfinite)

        assert x_shape == x_shape_gpu
        assert y_shape == y_shape_gpu

        xval = my_rand(*x_shape)
        yval = my_rand(*y_shape)
        lr = theano._asarray(0.01, dtype='float32')

        time_cpu = 0
        time_gpu = 0

        for i in range(n_train):
            # Train one batch on CPU
            t0 = time.time()
            rval_cpu = train_cpu(xval, yval, lr)[0]
            t1 = time.time()
            time_cpu += (t1 - t0)

            # Train one batch on GPU
            t0 = time.time()
            rval_gpu = train_gpu(xval, yval, lr)[0]
            t1 = time.time()
            time_gpu += (t1 - t0)

            # Compare results
            if (verbose or not numpy.allclose(rval_cpu, rval_gpu,
                                              rtol=1e-5, atol=float_atol)):
                print("At batch:", i + 1)
                print("CPU:", rval_cpu)
                print("GPU:", rval_gpu)
                print("abs diff:", numpy.absolute(rval_gpu - rval_cpu))
                print("rel diff:", numpy.absolute((
                    rval_gpu - rval_cpu) / rval_gpu))

            if not ignore_error:
                utt.assert_allclose(rval_cpu, rval_gpu,
                                    rtol=1e-5, atol=float_atol)

            # Synchronize parameters to start from the same point next time
            if i < n_train - 1:
                for cpu_p, gpu_p in zip(params_cpu, params_gpu):
                    cpu_p.set_value(gpu_p.get_value(borrow=False),
                                    borrow=True)

    finally:
        theano.tensor.basic.float32_atol = orig_float32_atol
def test_one_sequence_one_output_weights_gpu1(self):
    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = theano.tensor.fvector('u')
    x0 = theano.tensor.fscalar('x0')
    W_in = theano.tensor.fscalar('win')
    W = theano.tensor.fscalar('w')

    mode = mode_with_gpu.excluding('InputToGpuOptimizer')
    output, updates = theano.scan(f_rnn,
                                  u,
                                  x0,
                                  [W_in, W],
                                  n_steps=None,
                                  truncate_gradient=-1,
                                  go_backwards=False,
                                  mode=mode)

    output = GpuFromHost(test_ctx_name)(output)
    f2 = theano.function([u, x0, W_in, W],
                         output,
                         updates=updates,
                         allow_input_downcast=True,
                         mode=mode)

    rng = numpy.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5., high=5.)
    v_x0 = rng.uniform()
    W = rng.uniform()
    W_in = rng.uniform()

    v_u = numpy.asarray(v_u, dtype='float32')
    v_x0 = numpy.asarray(v_x0, dtype='float32')
    W = numpy.asarray(W, dtype='float32')
    W_in = numpy.asarray(W_in, dtype='float32')

    # compute the output in numpy
    v_out = numpy.zeros((4,))
    v_out[0] = v_u[0] * W_in + v_x0 * W
    for step in range(1, 4):
        v_out[step] = v_u[step] * W_in + v_out[step - 1] * W

    theano_values = f2(v_u, v_x0, W_in, W)
    utt.assert_allclose(theano_values, v_out)

    topo = f2.maker.fgraph.toposort()
    assert sum([isinstance(node.op, HostFromGpu)
                for node in topo]) == 0
    assert sum([isinstance(node.op, GpuFromHost)
                for node in topo]) == 4

    scan_node = [node for node in topo
                 if isinstance(node.op, theano.scan_module.scan_op.Scan)]
    assert len(scan_node) == 1
    scan_node = scan_node[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any([isinstance(node.op, GpuElemwise)
                for node in scan_node_topo])
    assert not any([isinstance(node.op, HostFromGpu)
                    for node in scan_node_topo])
    assert not any([isinstance(node.op, GpuFromHost)
                    for node in scan_node_topo])
def test_dnn_conv_alpha_output_merge(): if not dnn.dnn_available(test_ctx_name): raise SkipTest(dnn.dnn_available.msg) img = T.ftensor4() kern = T.ftensor4() out = T.ftensor4() b = 1 c = 4 f = 3 ih = 5 iw = 8 kh = 2 kw = 6 img_val = numpy.random.random((b, c, ih, iw)).astype('float32') kern_val = numpy.random.random((f, c, kh, kw)).astype('float32') out_val = numpy.random.random( (b, f, ih - kh + 1, iw - kw + 1)).astype('float32') conv = dnn.dnn_conv(img, kern) gw = theano.grad(conv.sum(), kern) gi = theano.grad(conv.sum(), img) lr = numpy.asarray(0.05, dtype='float32') fr = lr * (conv + out) wr = kern + lr * gw ir = img + lr * gi f1 = theano.function([img, kern, out], [fr, wr, ir], mode=mode_with_gpu) assert isinstance(f1.maker.fgraph.outputs[0].owner.inputs[0].owner.op, dnn.GpuDnnConv) assert isinstance(f1.maker.fgraph.outputs[1].owner.inputs[0].owner.op, dnn.GpuDnnConvGradW) assert isinstance(f1.maker.fgraph.outputs[2].owner.inputs[0].owner.op, dnn.GpuDnnConvGradI) mode = mode_with_gpu mode = mode.excluding('local_dnn_conv_alpha_merge') mode = mode.excluding('local_dnn_convw_alpha_merge') mode = mode.excluding('local_dnn_convi_alpha_merge') mode = mode.excluding('local_dnn_conv_output_merge') mode = mode.excluding('local_dnn_convw_output_merge') mode = mode.excluding('local_dnn_convi_output_merge') f2 = theano.function([img, kern, out], [fr, wr, ir], mode=mode) assert not isinstance(f2.maker.fgraph.outputs[0].owner.inputs[0].owner.op, dnn.GpuDnnConv) assert not isinstance(f2.maker.fgraph.outputs[1].owner.inputs[0].owner.op, dnn.GpuDnnConvGradW) assert not isinstance(f2.maker.fgraph.outputs[2].owner.inputs[0].owner.op, dnn.GpuDnnConvGradI) out_f1 = f1(img_val, kern_val, out_val) out_f2 = f2(img_val, kern_val, out_val) assert len(out_f1) == len(out_f2) for v1, v2 in zip(out_f1, out_f2): utt.assert_allclose(v1, v2)
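# Hedged aside (plain NumPy/SciPy, not part of the test): the "alpha merge"
# and "output merge" rewrites exercised above exist because cuDNN's
# convolution primitive computes y = alpha * corr(img, kern) + beta * y0 in
# a single call, so a graph like lr * (conv + out) can be folded into the
# GpuDnnConv node instead of separate elemwise ops. Numerically the fused
# and unfused forms agree, as this single-channel sketch checks:
import numpy as np
from scipy.signal import correlate

img = np.random.rand(5, 8).astype('float32')
kern = np.random.rand(2, 6).astype('float32')
out0 = np.random.rand(4, 3).astype('float32')   # valid shape: (5-2+1, 8-6+1)
lr = np.float32(0.05)
conv = correlate(img, kern, mode='valid')
assert np.allclose(lr * (conv + out0), lr * conv + lr * out0, atol=1e-6)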
def test1(self): a = tensor.dmatrix() w = sort(a) f = theano.function([a], w) utt.assert_allclose(f(self.m_val), np.sort(self.m_val))
def check_equality_two_nd_array(a, b): utt.assert_allclose(a, b, atol=1e-5, rtol=1e-5) return True
def test_machine_translation(self):
    # This test case comes from https://github.com/rizar/scan-grad-speed and
    # is an example of actual computation done with scan in the context of
    # machine translation
    #
    # 'dim' has been reduced from 1000 to 5 to make the test run faster

    # Parameters from an actual machine translation run
    batch_size = 80
    seq_len = 50
    dim = 5

    # Weight matrices
    U = theano.shared(
        np.random.normal(size=(dim, dim),
                         scale=0.0001).astype(config.floatX))
    U.name = "U"
    V = theano.shared(U.get_value())
    V.name = "V"
    W = theano.shared(U.get_value())
    W.name = "W"

    # Variables and their values
    x = T.tensor3("x")
    x_value = np.random.normal(size=(seq_len, batch_size, dim),
                               scale=0.0001).astype(config.floatX)

    ri = T.tensor3("ri")
    ri_value = x_value

    zi = T.tensor3("zi")
    zi_value = x_value

    init = T.alloc(np.cast[config.floatX](0), batch_size, dim)

    def rnn_step1(
            # sequences
            x, ri, zi,
            # outputs_info
            h):
        pre_r = ri + h.dot(U)
        pre_z = zi + h.dot(V)
        r = T.nnet.sigmoid(pre_r)
        z = T.nnet.sigmoid(pre_z)

        after_r = r * h
        pre_h = x + after_r.dot(W)
        new_h = T.tanh(pre_h)

        res_h = z * new_h + (1 - z) * h
        return res_h

    # Compile the function twice, once with the optimization and once
    # without
    opt_mode = mode.including("scan")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name="fpass1",
                       mode=opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_opt = theano.function(inputs=[x, ri, zi],
                            outputs=grad1,
                            mode=opt_mode)

    no_opt_mode = mode.excluding("scanOp_pushout_output")
    h, _ = theano.scan(rnn_step1,
                       sequences=[x, ri, zi],
                       n_steps=seq_len,
                       outputs_info=init,
                       name="fpass1",
                       mode=no_opt_mode)
    cost = h[-1].sum()
    grad1 = T.grad(cost, [U, V, W])
    f_no_opt = theano.function(inputs=[x, ri, zi],
                               outputs=grad1,
                               mode=no_opt_mode)

    # Validate that the optimization has been applied: no Elemwise output of
    # the gradient scan should be fed directly by a Dot, since the Dots
    # should have been pushed out of the scan. (The original check compared
    # the input *variables* against the Dot op class, which is always False;
    # inspect the owning op instead.)
    scan_node_grad = [node for node in f_opt.maker.fgraph.toposort()
                      if isinstance(node.op, Scan)][1]

    for output in scan_node_grad.op.outputs:
        assert not (isinstance(output.owner.op, T.elemwise.Elemwise) and
                    any([i.owner and isinstance(i.owner.op, T.Dot)
                         for i in output.owner.inputs]))

    # Compare the outputs of the two functions on the same input data.
    f_opt_output = f_opt(x_value, ri_value, zi_value)
    f_no_opt_output = f_no_opt(x_value, ri_value, zi_value)
    utt.assert_allclose(f_opt_output, f_no_opt_output)
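# For readability, a hedged NumPy transcription of rnn_step1 above (names
# illustrative, not part of the test): it is a GRU-like gated update where
# r gates the recurrent contribution and z interpolates between the
# candidate state and the previous state.
import numpy as np

def rnn_step1_ref(x, ri, zi, h, U, V, W):
    sigmoid = lambda v: 1.0 / (1.0 + np.exp(-v))
    r = sigmoid(ri + h.dot(U))             # reset gate
    z = sigmoid(zi + h.dot(V))             # update gate
    new_h = np.tanh(x + (r * h).dot(W))    # candidate state
    return z * new_h + (1 - z) * h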
def test_DownsampleFactorMaxStride(self):
    rng = numpy.random.RandomState(utt.fetch_seed())
    # maxpool, stride, ignore_border, input, output sizes
    examples = (
        ((1, 1), (1, 1), True, (4, 10, 16, 16), (4, 10, 16, 16)),
        ((1, 1), (3, 3), True, (4, 10, 16, 16), (4, 10, 6, 6)),
        ((1, 1), (5, 7), True, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((1, 1), (1, 1), False, (4, 10, 16, 16), (4, 10, 16, 16)),
        ((1, 1), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
        ((1, 1), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((3, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 14, 14)),
        ((3, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 5, 5)),
        ((3, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
        ((3, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 14, 14)),
        ((3, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 6, 6)),
        ((3, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((5, 3), (1, 1), True, (4, 10, 16, 16), (4, 10, 12, 14)),
        ((5, 3), (3, 3), True, (4, 10, 16, 16), (4, 10, 4, 5)),
        ((5, 3), (5, 7), True, (4, 10, 16, 16), (4, 10, 3, 2)),
        ((5, 3), (1, 1), False, (4, 10, 16, 16), (4, 10, 12, 14)),
        ((5, 3), (3, 3), False, (4, 10, 16, 16), (4, 10, 5, 6)),
        ((5, 3), (5, 7), False, (4, 10, 16, 16), (4, 10, 4, 3)),
        ((16, 16), (1, 1), True, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (3, 3), True, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (5, 7), True, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (1, 1), False, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (3, 3), False, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((16, 16), (5, 7), False, (4, 10, 16, 16), (4, 10, 1, 1)),
        ((3,), (5,), True, (16,), (3,)),
        ((3,), (5,), True, (2, 16), (2, 3)),
        ((5,), (3,), True, (2, 3, 16), (2, 3, 4)),
        ((5, 1, 3), (3, 3, 3), True, (2, 16, 16, 16), (2, 4, 6, 5)),
        ((5, 1, 3), (3, 3, 3), True, (4, 2, 16, 16, 16), (4, 2, 4, 6, 5)),
    )

    for example, mode in product(examples,
                                 ['max', 'sum', 'average_inc_pad',
                                  'average_exc_pad']):
        (maxpoolshp, stride, ignore_border, inputshp, outputshp) = example
        # generate random images
        imval = rng.rand(*inputshp)
        images = theano.shared(imval)
        # Pool op
        numpy_output_val = self.numpy_max_pool_nd_stride(
            imval, maxpoolshp, ignore_border, stride, mode)
        assert numpy_output_val.shape == outputshp, (
            "outshape is %s, calculated shape is %s"
            % (outputshp, numpy_output_val.shape))
        maxpool_op = Pool(ndim=len(maxpoolshp),
                          ignore_border=ignore_border,
                          mode=mode)(images, maxpoolshp, stride)
        f = function([], maxpool_op)
        output_val = f()
        utt.assert_allclose(output_val, numpy_output_val)
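# The expected output sizes in the table above follow a per-dimension shape
# rule; this is a hedged transcription (helper name illustrative), checked
# against the table entries rather than taken from Theano's source:
def pool_out_size(in_size, pool, stride, ignore_border):
    if ignore_border:
        # only windows that fit entirely inside the input
        return (in_size - pool) // stride + 1
    if stride >= pool:
        # non-overlapping windows: one per stride position that starts
        # inside the input
        return (in_size - 1) // stride + 1
    # overlapping windows: the full windows, plus one partial at the end
    return max(0, (in_size - pool - 1) // stride + 1) + 1

# e.g. pool_out_size(16, 5, 3, True) == 4 and
#      pool_out_size(16, 5, 3, False) == 5, matching the (5, 3) rows above.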
def _cmp(self, n, m, f, f_gpu): data = numpy.arange(n * m, dtype='float32').reshape(n, m) out = f(data) gout = f_gpu(data) utt.assert_allclose(out, gout)
def test_zca_dataset():
    """
    Tests the ZCA_Dataset class.
    """
    # Preparation
    rng = np.random.RandomState([2014, 11, 4])
    start = 0
    stop = 990
    num_examples = 1000
    num_feat = 5
    num_classes = 2

    # random_dense_design_matrix has values that are centered and of
    # unit stdev, which is not useful to test the ZCA.
    # So, we replace its value by an uncentered uniform one.
    raw = random_dense_design_matrix(rng, num_examples, num_feat, num_classes)
    x = rng.uniform(low=-0.5, high=2.0, size=(num_examples, num_feat))
    x = x.astype(np.float32)
    raw.X = x

    zca = ZCA(filter_bias=0.0)
    zca.apply(raw, can_fit=True)
    zca_dataset = ZCA_Dataset(raw, zca, start, stop)

    # Testing general behaviour: whitened data has ~zero mean and ~unit stdev
    mean = zca_dataset.X.mean(axis=0)
    std = zca_dataset.X.std(axis=0)
    assert_allclose(mean, np.zeros(num_feat), atol=1e-2)
    assert_allclose(std, np.ones(num_feat), atol=1e-2)

    # Testing mapback()
    y = zca_dataset.mapback(zca_dataset.X)
    assert_allclose(x[start:stop], y)

    # Testing mapback_for_viewer()
    y = zca_dataset.mapback_for_viewer(zca_dataset.X)
    z = x / np.abs(x).max(axis=0)
    assert_allclose(z[start:stop], y, rtol=1e-2)

    # Testing adjust_for_viewer()
    y = zca_dataset.adjust_for_viewer(x.T).T
    z = x / np.abs(x).max(axis=0)
    assert_allclose(z, y)

    # Testing adjust_to_be_viewed_with()
    y = zca_dataset.adjust_to_be_viewed_with(x, 2 * x, True)
    z = zca_dataset.adjust_for_viewer(x)
    assert_allclose(z / 2, y)
    y = zca_dataset.adjust_to_be_viewed_with(x, 2 * x, False)
    z = x / np.abs(x).max()
    assert_allclose(z / 2, y)

    # Testing has_targets()
    assert zca_dataset.has_targets()
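# For context, a hedged NumPy sketch of ZCA whitening itself (independent of
# the ZCA/ZCA_Dataset classes; eps and names are illustrative): rotate into
# the eigenbasis of the covariance, rescale each component to unit variance,
# and rotate back. This is why the whitened data checked above has roughly
# zero mean and unit stdev per feature.
import numpy as np

def zca_whiten(X, eps=1e-8):
    Xc = X - X.mean(axis=0)
    cov = np.cov(Xc, rowvar=False)
    s, U = np.linalg.eigh(cov)      # eigendecomposition of the covariance
    W = U.dot(np.diag(1.0 / np.sqrt(s + eps))).dot(U.T)
    return Xc.dot(W)                # decorrelated, unit-variance features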
def validate( self, image_shape, filter_shape, border_mode="valid", subsample=(1, 1, 1), input=None, filters=None, verify_grad=True, non_contiguous=False, filter_dilation=(1, 1, 1), ): """ :param image_shape: The constant shape info passed to corr3dMM. :param filter_shape: The constant shape info passed to corr3dMM. """ if not theano.config.cxx: pytest.skip("Need cxx for this test") N_image_shape = [ T.get_scalar_constant_value(T.as_tensor_variable(x)) for x in image_shape ] N_filter_shape = [ T.get_scalar_constant_value(T.as_tensor_variable(x)) for x in filter_shape ] if input is None: input = self.input if filters is None: filters = self.filters # THEANO IMPLEMENTATION # we create a symbolic function so that verify_grad can work def sym_Corr3dMM(input, filters): # define theano graph and function input.name = "input" filters.name = "filters" rval = corr3d.Corr3dMM(border_mode, subsample, filter_dilation)(input, filters) rval.name = "corr_output" return rval output = sym_Corr3dMM(input, filters) output.name = "Corr3dMM()(%s,%s)" % (input.name, filters.name) theano_corr = theano.function([input, filters], output, mode=self.mode) # initialize input and compute result image_data = np.random.random(N_image_shape).astype(self.dtype) filter_data = np.random.random(N_filter_shape).astype(self.dtype) image_data /= 10 filter_data /= 10 if non_contiguous: image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2)) image_data = image_data.copy() image_data = np.transpose(image_data, axes=(0, 1, 4, 3, 2)) filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2)) filter_data = filter_data.copy() filter_data = np.transpose(filter_data, axes=(0, 1, 4, 3, 2)) assert not image_data.flags["CONTIGUOUS"] assert not filter_data.flags["CONTIGUOUS"] theano_output = theano_corr(image_data, filter_data) # REFERENCE IMPLEMENTATION # Testing correlation, not convolution. Reverse filters. 
        filter_data_corr = np.array(filter_data[:, :, ::-1, ::-1, ::-1],
                                    copy=True, order="C")
        orig_image_data = image_data
        img_shape3d = np.array(N_image_shape[-3:])
        fil_shape3d = np.array(N_filter_shape[-3:])
        dil_shape3d = np.array(filter_dilation)
        dil_fil_shape3d = (fil_shape3d - 1) * dil_shape3d + 1
        subsample3d = np.array(subsample)
        if border_mode == "full":
            padHWD = dil_fil_shape3d - 1
        elif border_mode == "valid":
            padHWD = np.array([0, 0, 0])
        elif border_mode == "half":
            padHWD = np.floor(dil_fil_shape3d / 2).astype("int32")
        elif isinstance(border_mode, tuple):
            padHWD = np.array(border_mode)
        elif isinstance(border_mode, integer_types):
            padHWD = np.array([border_mode, border_mode, border_mode])
        else:
            raise NotImplementedError(
                "Unsupported border_mode {}".format(border_mode))
        out_shape3d = (np.floor(
            (img_shape3d + 2 * padHWD - dil_fil_shape3d) / subsample3d) + 1)
        # avoid numpy deprecation
        out_shape3d = out_shape3d.astype("int32")
        out_shape = (N_image_shape[0], N_filter_shape[0]) + tuple(out_shape3d)
        ref_output = np.zeros(out_shape)

        # zero-pad the input before the explicit correlation loop
        image_data2 = np.zeros((
            N_image_shape[0],
            N_image_shape[1],
            N_image_shape[2] + 2 * padHWD[0],
            N_image_shape[3] + 2 * padHWD[1],
            N_image_shape[4] + 2 * padHWD[2],
        ))
        image_data2[:, :,
                    padHWD[0]:padHWD[0] + N_image_shape[2],
                    padHWD[1]:padHWD[1] + N_image_shape[3],
                    padHWD[2]:padHWD[2] + N_image_shape[4]] = image_data
        image_data = image_data2
        N_image_shape = image_data.shape

        # loop over output feature maps; flipping the already-flipped
        # filter_data_corr recovers the original filters, i.e. correlation
        for bb in range(N_image_shape[0]):
            for nn in range(N_filter_shape[0]):
                for im0 in range(N_image_shape[1]):
                    filter3d = filter_data_corr[nn, im0, :, :, :]
                    image3d = image_data[bb, im0, :, :, :]
                    for row in range(ref_output.shape[2]):
                        irow = row * subsample[0]  # image row
                        for col in range(ref_output.shape[3]):
                            icol = col * subsample[1]  # image col
                            for slc in range(ref_output.shape[4]):
                                islc = slc * subsample[2]  # image slice
                                ref_output[bb, nn, row, col, slc] += (
                                    image3d[
                                        irow:irow + dil_fil_shape3d[0]:
                                        filter_dilation[0],
                                        icol:icol + dil_fil_shape3d[1]:
                                        filter_dilation[1],
                                        islc:islc + dil_fil_shape3d[2]:
                                        filter_dilation[2]]
                                    * filter3d[::-1, ::-1, ::-1]).sum()

        utt.assert_allclose(theano_output, ref_output)

        # TEST GRADIENT
        if verify_grad:
            utt.verify_grad(sym_Corr3dMM, [orig_image_data, filter_data],
                            mode=self.mode)
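# The reference above sizes its output with the standard correlation shape
# rule; as a hedged per-dimension summary (helper name illustrative): with
# padding p, stride s, and a kernel dilated to k_d = (k - 1) * d + 1,
#   out = floor((in + 2 * p - k_d) / s) + 1
def corr_out_size(in_size, k, p, s, d=1):
    k_d = (k - 1) * d + 1           # dilated kernel extent
    return (in_size + 2 * p - k_d) // s + 1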