class TestGroupGpuCorr2d(TestGroupedConvNoOptim):
    """Grouped-convolution test case configured with the ``GpuCorrMM`` ops.

    All test logic comes from ``TestGroupedConvNoOptim``; this subclass only
    overrides class-level settings.
    """

    # GPU compilation mode with everything tagged "cudnn" excluded.
    mode = mode_with_gpu.excluding("cudnn")
    is_dnn = False
    flip_filter = True

    # Op classes handed to the parent test case.
    conv_op = GpuCorrMM
    conv_gradw_op = GpuCorrMM_gradWeights
    conv_gradi_op = GpuCorrMM_gradInputs
def setup_method(self):
    """Select the GPU block-sparse ops and mode, then run the parent setup."""
    # GPU mode with the "constant_folding" tag excluded.
    self.mode = mode_with_gpu.excluding("constant_folding")

    # Callable ops and their classes, GPU flavour.
    self.gemv_op, self.gemv_class = gpu_sparse_block_gemv, GpuSparseBlockGemv
    self.outer_op, self.outer_class = gpu_sparse_block_outer, GpuSparseBlockOuter

    super().setup_method()
def run_gpu_cholesky(self, A_val, lower=True):
    """Compile ``GpuMagmaCholesky`` on an ``fmatrix`` input and apply it to ``A_val``.

    Parameters
    ----------
    A_val
        Value passed to the compiled function.
    lower
        Forwarded to ``GpuMagmaCholesky(lower=...)``.

    Returns
    -------
    Whatever the compiled function returns for ``A_val``.
    """
    A = aesara.tensor.fmatrix("A")
    chol_op = GpuMagmaCholesky(lower=lower)
    # The "cusolver" tag is excluded from the GPU mode for this compilation.
    no_cusolver = mode_with_gpu.excluding("cusolver")
    compiled = aesara.function([A], chol_op(A), mode=no_cusolver)
    return compiled(A_val)
def test_gpu_cholesky_opt(self):
    """``cholesky`` on a float32 matrix must compile to a ``GpuMagmaCholesky`` node.

    The function is compiled with ``mode_with_gpu`` minus the ``"cusolver"`` tag.
    """
    A = aesara.tensor.matrix("A", dtype="float32")
    mode = mode_with_gpu.excluding("cusolver")
    fn = aesara.function([A], cholesky(A), mode=mode)

    nodes = fn.maker.fgraph.toposort()
    assert any(isinstance(node.op, GpuMagmaCholesky) for node in nodes)
def test_pool_c_interface(self):
    """``GpuPool(ignore_border=False)`` with pad ``[1, 1]`` must raise ``ValueError``."""
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    shape = (2, 2, 2, 2)
    inp = aet.as_tensor_variable(aesara.shared(rand(*shape), "a"))

    # test when ignore_border and pad >= 0
    # Building, compiling and running all stay inside the context manager, so
    # the ValueError is accepted from any of those steps.
    with pytest.raises(ValueError):
        pool_op = GpuPool(ignore_border=False, ndim=2)
        pad = aet.as_tensor_variable([1, 1])
        pooled = pool_op(inp, [2, 2], pad=pad)
        f = aesara.function([], pooled, mode=gpu_mode)
        f()
def test_pool_big_ws(self):
    """Run ``GpuPool`` with a 5x5 window on a ``(2, 2, 2, 2)`` input.

    There is no assertion on the result; the test passes when compiling and
    calling the function does not raise.
    """
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    shape = (2, 2, 2, 2)
    inp = aet.as_tensor_variable(aesara.shared(rand(*shape), "a"))

    pool_op = GpuPool(ignore_border=False, mode="average_exc_pad", ndim=2)
    pad = aet.as_tensor_variable([0, 0])
    # The window [5, 5] is larger than the trailing 2x2 dimensions of the input.
    pooled = pool_op(inp, [5, 5], stride=[1, 1], pad=pad)
    f = aesara.function([], pooled, mode=gpu_mode)
    f()
def setup_method(self):
    """Point the join/split test fixtures at their GPU implementations."""

    def shared(x, **kwargs):
        # Shared-variable constructor pinned to the test context.
        return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)

    self.mode = mode_with_gpu.excluding("constant_folding")
    self.join_op = GpuJoin()
    self.split_op_class = GpuSplit
    # Use join instead of MakeVector since there is no MakeVector on GPU
    self.make_vector_op = GpuJoin()
    # this is to avoid errors with limited devices
    self.floatX = "float32"
    # True unless the configured mode is one of the two debug-mode spellings.
    self.hide_error = aesara.config.mode not in ("DebugMode", "DEBUG_MODE")
    self.shared = shared
def test_transfer_gpu_gpu():
    """``GpuToGpu`` must survive compilation as the only node and preserve the value.

    The two transfer-cutting optimizations are excluded from the mode used for
    this compilation.
    """
    gpu_type = GpuArrayType(
        dtype="float32", broadcastable=(False, False), context_name=test_ctx_name
    )
    g = gpu_type()

    host_val = np.asarray(rng.rand(5, 4), dtype="float32")
    gpu_val = gpuarray.array(host_val, context=get_context(test_ctx_name))

    mode = mode_with_gpu.excluding(
        "cut_gpua_host_transfers", "local_cut_gpua_host_gpua"
    )
    f = theano.function([g], GpuToGpu(test_ctx_name)(g), mode=mode)

    # The compiled graph must consist of exactly the GpuToGpu node.
    topo = f.maker.fgraph.toposort()
    assert len(topo) == 1
    assert isinstance(topo[0].op, GpuToGpu)

    result = f(gpu_val)
    assert GpuArrayType.values_eq(result, gpu_val)
def test_magma_opt_float16(self):
    """Each listed op on a float16 matrix must compile to its Magma GPU op.

    Every function is compiled with ``mode_with_gpu`` minus the ``"cusolver"`` tag.
    """
    # (op instance applied to the input, GPU op class expected in the graph)
    ops_to_gpu = [
        (MatrixInverse(), GpuMagmaMatrixInverse),
        (SVD(), GpuMagmaSVD),
        (QRFull(mode="reduced"), GpuMagmaQR),
        (QRIncomplete(mode="r"), GpuMagmaQR),
        # TODO: add support for float16 to Eigh numpy
        # (Eigh(), GpuMagmaEigh),
        (Cholesky(), GpuMagmaCholesky),
    ]

    for op, gpu_op in ops_to_gpu:
        A = aesara.tensor.matrix("A", dtype="float16")
        mode = mode_with_gpu.excluding("cusolver")
        fn = aesara.function([A], op(A), mode=mode)
        nodes = fn.maker.fgraph.toposort()
        assert any(isinstance(node.op, gpu_op) for node in nodes)
def test_shape():
    """Check how ``x.shape`` of a 3-d ``GpuArrayType`` variable is compiled.

    With the default mode (unless it is ``FAST_COMPILE``) the graph must be
    three ``Shape_i`` nodes followed by one ``MakeVector``.  With
    ``local_shape_to_shape_i`` excluded from ``mode_with_gpu`` it must be a
    single ``Shape`` node.  Both compiled functions must return ``(3, 4, 5)``.
    """
    expected = (3, 4, 5)
    x = GpuArrayType(dtype="float32", broadcastable=[False, False, False])()
    v = gpuarray.zeros(expected, dtype="float32", context=get_context(test_ctx_name))

    # Default mode.
    f = aesara.function([x], x.shape)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == expected)
    if aesara.config.mode != "FAST_COMPILE":
        assert len(topo) == 4
        for node in topo[:3]:
            assert isinstance(node.op, tt.opt.Shape_i)
        assert isinstance(topo[3].op, tt.opt.MakeVector)

    # Same expression with the Shape -> Shape_i rewrite excluded.
    mode = mode_with_gpu.excluding("local_shape_to_shape_i")
    f = aesara.function([x], x.shape, mode=mode)
    topo = f.maker.fgraph.toposort()
    assert np.all(f(v) == expected)
    assert len(topo) == 1
    assert isinstance(topo[0].op, tt.Shape)
def test_blocksparse_grad_merge(self):
    """The merged and unmerged weight updates must produce the same ``W``.

    ``f1`` is compiled with ``mode_with_gpu`` and must end in a
    ``GpuSparseBlockOuter`` output.  ``f2`` is compiled with the two
    ``local_merge_blocksparse_*`` optimizations excluded and must not.  Both
    are run from the same initial ``W`` and the results are compared.
    """
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

    o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = theano.grad(o.sum(), W)

    lr = np.asarray(0.05, dtype="float32")
    upd = W - lr * gW

    # Values fed to both compiled functions, in input order.
    call_args = (h_val, iIdx_val, b_val, oIdx_val)

    f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)
    # Make sure the lr update was merged.
    merged_out = f1.maker.fgraph.outputs[0]
    assert isinstance(merged_out.owner.op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha").excluding(
        "local_merge_blocksparse_output"
    )
    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)
    # Make sure the lr update is not merged.
    unmerged_out = f2.maker.fgraph.outputs[0]
    assert not isinstance(unmerged_out.owner.op, GpuSparseBlockOuter)

    # Reference result from the unmerged function.
    f2(*call_args)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)

    # Result from the merged function, starting from the same W.
    f1(*call_args)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
class TestGpuReduceDtype(test_elemwise.TestReduceDtype):
    """``TestReduceDtype`` configured for the GPU reduction ops.

    Skipped at setup time unless the test context is a CUDA context.
    """

    mode = mode_with_gpu.excluding("local_cut_useless_reduce")

    # GpuDnnReduction doesn't cover all cases, but should cover some
    op = (GpuCAReduceCuda, GpuDnnReduction)

    # Currently we don't support reduction on 0 axis
    # NOTE(review): ``1`` is listed twice; presumably the second one stands in
    # for an unsupported entry of the parent's list -- confirm.
    axes = [None, 0, 1, 1, [0], [1], [0, 1]]

    # We don't support complex dtype
    dtypes = [
        # signed integers
        "int8",
        "int16",
        "int32",
        "int64",
        # unsigned integers
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        # floats
        "float32",
        "float64",
    ]

    def setup_method(self):
        """Skip the test when the test context's kind is not ``b"cuda"``."""
        ctx = get_context(test_ctx_name)
        if ctx.kind != b"cuda":
            pytest.skip("Cuda specific tests")
import numpy as np import tests.unittest_tools as utt import theano import theano.tensor as tt from tests.gpuarray.config import mode_with_gpu, mode_without_gpu from theano.gpuarray.nnet import ( GpuCrossentropySoftmax1HotWithBiasDx, GpuCrossentropySoftmaxArgmax1HotWithBias, GpuSoftmax, GpuSoftmaxWithBias, ) from theano.tensor.nnet import crossentropy_softmax_1hot_with_bias_dx mode_wo_cudnn = mode_with_gpu.excluding("cudnn") def test_GpuCrossentropySoftmaxArgmax1HotWithBias(): # This is basic test for GpuCrossentropySoftmaxArgmax1HotWithBias # We check that we loop when their is too much threads n_in = 1000 batch_size = 4097 n_out = 1250 if not isinstance(mode_with_gpu, theano.compile.DebugMode): n_in = 4098 n_out = 4099 y = tt.lvector("y")
def test_pool2d():
    """Compare ``Pool`` compiled for the GPU against the non-GPU reference.

    For every shape / pooling mode / window / stride / (ignore_border, pad)
    combination that passes the skip rules below, the test checks:

    * the forward pass (``GpuPool`` vs ``Pool``),
    * the gradient (``GpuMaxPoolGrad``/``GpuAveragePoolGrad`` vs their
      non-GPU counterparts),
    * for ``"max"`` only, the R-op and the grad-of-grad
      (``GpuDownsampleFactorMaxGradGrad`` vs ``DownsampleFactorMaxGradGrad``).

    Each check asserts both that the expected op is in the compiled graph and
    that the numerical results agree under ``np.allclose``.
    """

    def has_op(fn, op_cls):
        # True when the compiled graph of `fn` contains a node of type `op_cls`.
        return any(isinstance(node.op, op_cls) for node in fn.maker.fgraph.toposort())

    shps = [
        (1, 12),
        (1, 1, 12),
        (1, 1, 1, 12),
        (1, 1, 2, 2),
        (1, 1, 1, 1),
        (1, 1, 4, 4),
        (1, 1, 10, 11),
        (1, 2, 2, 2),
        (3, 5, 4, 4),
        (25, 1, 7, 7),
        (1, 1, 12, 12),
        (1, 1, 2, 14),
        (1, 1, 12, 14),
        (1, 1, 14, 14),
        (1, 1, 16, 16),
        (1, 1, 18, 18),
        (1, 1, 24, 24),
        (1, 6, 24, 24),
        (10, 1, 24, 24),
        (10, 6, 24, 24),
        (30, 6, 12, 12),
        (30, 2, 24, 24),
        (30, 6, 24, 24),
        (10, 10, 10, 11),
        (1, 1, 10, 1025),
        (1, 1, 10, 1023),
        (1, 1, 1025, 10),
        (1, 1, 1023, 10),
        (3, 2, 16, 16, 16),
        (3, 2, 6, 6, 6, 5),
        (3, 2, 6, 6, 6, 5, 7),
    ]
    # Shuffle the shapes with a generator seeded from the test seed.
    np.random.RandomState(utt.fetch_seed()).shuffle(shps)

    test_ws = (2, 2), (3, 2), (1, 1)
    test_st = (2, 2), (3, 2), (1, 1)
    test_mode = ["max", "sum", "average_inc_pad", "average_exc_pad"]

    # Reference (non-GPU) mode: work on a copy so the shared mode object is
    # not modified.
    ref_mode = copy.copy(mode_without_gpu)
    ref_mode.check_py_code = False
    gpu_mode = mode_with_gpu.excluding("cudnn")
    gpu_mode.check_py_code = False

    for shp in shps:
        for mode, ws, st in itertools.product(test_mode, test_ws, test_st):
            # Skip windows larger than the two trailing dimensions.
            if ws[0] > shp[-2] or ws[1] > shp[-1]:
                continue
            for ignore_border, pad in zip((True, False), [(1, 1), (0, 0)]):
                # Skip padding that is not smaller than the window.
                if pad[0] >= ws[0] or pad[1] >= ws[1]:
                    continue
                # Skip "average_exc_pad" whenever there is any padding.
                if mode == "average_exc_pad" and (pad[0] > 0 or pad[1] > 0):
                    continue

                # Reported as the assertion message on numerical mismatch.
                case = (shp, ws, st, pad, mode, ignore_border)

                # ---- forward pass -------------------------------------
                ds_op = Pool(ndim=len(ws), mode=mode, ignore_border=ignore_border)
                a = aesara.shared(rand(*shp), "a")
                a_pooled = ds_op(aet.as_tensor_variable(a), ws, st, pad)

                f = aesara.function([], a_pooled, mode=gpu_mode)
                f2 = aesara.function([], a_pooled, mode=ref_mode)
                assert has_op(f, GpuPool)
                assert has_op(f2, Pool)
                assert np.allclose(f(), f2()), case

                # ---- gradient -----------------------------------------
                a_pooled_grad = grad(a_pooled.sum(), a)

                g = aesara.function([], a_pooled_grad, mode=gpu_mode)
                g2 = aesara.function([], a_pooled_grad, mode=ref_mode)

                gop, gop2 = (
                    (GpuMaxPoolGrad, MaxPoolGrad)
                    if mode == "max"
                    else (GpuAveragePoolGrad, AveragePoolGrad)
                )
                assert has_op(g, gop)
                assert has_op(g2, gop2)
                assert np.allclose(g(), g2()), case

                # test rop and grad grad for max pooling
                # for average pooling grad grad is just average pooling grad
                if mode != "max":
                    continue

                # ---- R-op (max only) ----------------------------------
                ea = aesara.shared(rand(*shp), "ea")

                gr = aesara.function([], Rop(a_pooled, a, ea), mode=gpu_mode)
                gr2 = aesara.function([], Rop(a_pooled, a, ea), mode=ref_mode)
                assert has_op(gr, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gr2, DownsampleFactorMaxGradGrad)
                assert np.allclose(gr(), gr2()), case

                # ---- grad of grad (max only) --------------------------
                ggf = Lop(grad((a_pooled ** 2).sum(), a), a, a)

                gg = aesara.function([], ggf, mode=gpu_mode)
                gg2 = aesara.function([], ggf, mode=ref_mode)
                assert has_op(gg, GpuDownsampleFactorMaxGradGrad)
                assert has_op(gg2, DownsampleFactorMaxGradGrad)
                assert np.allclose(gg(), gg2()), case
class TestGroupGpuCorr3d(TestGroupedConv3dNoOptim):
    """3-d grouped-convolution test case configured with the ``GpuCorr3dMM`` ops.

    All test logic comes from ``TestGroupedConv3dNoOptim``; this subclass only
    overrides class-level settings.
    """

    # GPU compilation mode with everything tagged "cudnn" excluded.
    mode = mode_with_gpu.excluding("cudnn")

    # Op classes handed to the parent test case.
    conv_op = GpuCorr3dMM
    conv_gradw_op = GpuCorr3dMM_gradWeights
    conv_gradi_op = GpuCorr3dMM_gradInputs
def setup_class(cls):
    """Run the parent class setup, then install the GPU shared constructor and mode."""
    super().setup_class()
    # These are assigned after the parent setup, so they replace any
    # ``mode`` / ``shared`` value the parent setup may have stored on ``cls``.
    cls.mode = mode_with_gpu.excluding("cudnn")
    # staticmethod keeps the constructor from being bound as a method.
    cls.shared = staticmethod(gpuarray_shared_constructor)
class TestFusion(test_opt.TestFusion):
    """``test_opt.TestFusion`` re-run with GPU settings.

    Only class-level configuration is overridden here.
    """

    # GPU mode with "local_dnn_reduction" excluded.
    mode = mode_with_gpu.excluding("local_dnn_reduction")

    # Host<->GPU transfer op classes.
    # NOTE(review): presumably the parent ignores these node types when
    # inspecting the compiled graph -- confirm against test_opt.TestFusion.
    topo_exclude = (GpuFromHost, HostFromGpu)

    # staticmethod keeps the constructor from being bound as a method.
    _shared = staticmethod(gpuarray_shared_constructor)
def test_one_sequence_one_output_weights_gpu1(self):
    """Scan over one sequence with two scalar weights, compiled for the GPU.

    The recurrence is ``x_t = u_t * W_in + x_{t-1} * W``.  The test checks
    that:

    * the compiled function matches a NumPy evaluation of the recurrence,
    * the outer graph contains exactly one ``Scan`` node, no ``HostFromGpu``
      and four ``GpuFromHost`` nodes,
    * the inner scan graph contains a ``GpuElemwise`` and no host/GPU
      transfer nodes.
    """

    def f_rnn(u_t, x_tm1, W_in, W):
        return u_t * W_in + x_tm1 * W

    u = fvector("u")
    x0 = fscalar("x0")
    W_in = fscalar("win")
    W = fscalar("w")

    mode = mode_with_gpu.excluding("InputToGpuOptimizer")
    output, updates = scan(
        f_rnn,
        u,
        x0,
        [W_in, W],
        n_steps=None,
        truncate_gradient=-1,
        go_backwards=False,
        mode=mode,
    )

    output = GpuFromHost(test_ctx_name)(output)
    f2 = aesara.function(
        [u, x0, W_in, W],
        output,
        updates=updates,
        allow_input_downcast=True,
        mode=mode,
    )

    # Draw order (v_u, v_x0, W, W_in) is kept so the sampled values are
    # unchanged.  The numeric weights get their own names instead of
    # rebinding the symbolic ``W`` / ``W_in`` defined above.
    rng = np.random.RandomState(utt.fetch_seed())
    v_u = rng.uniform(size=(4,), low=-5.0, high=5.0)
    v_x0 = rng.uniform()
    v_W = rng.uniform()
    v_W_in = rng.uniform()

    v_u = np.asarray(v_u, dtype="float32")
    v_x0 = np.asarray(v_x0, dtype="float32")
    v_W = np.asarray(v_W, dtype="float32")
    v_W_in = np.asarray(v_W_in, dtype="float32")

    # compute the output in numpy
    v_out = np.zeros((4,))
    v_out[0] = v_u[0] * v_W_in + v_x0 * v_W
    for step in range(1, 4):
        v_out[step] = v_u[step] * v_W_in + v_out[step - 1] * v_W

    aesara_values = f2(v_u, v_x0, v_W_in, v_W)
    utt.assert_allclose(aesara_values, v_out)

    # Outer graph.  The earlier copy of this lookup, marked "TO DEL", repeated
    # the same computation and assertion and has been removed.
    topo = f2.maker.fgraph.toposort()
    assert sum(isinstance(node.op, HostFromGpu) for node in topo) == 0
    assert sum(isinstance(node.op, GpuFromHost) for node in topo) == 4

    # NOTE(review): ``scan`` is called as a function above and used as a
    # namespace here (``scan.op.Scan``); confirm what the file imports.
    scan_nodes = [node for node in topo if isinstance(node.op, scan.op.Scan)]
    assert len(scan_nodes) == 1
    scan_node = scan_nodes[0]
    scan_node_topo = scan_node.op.fn.maker.fgraph.toposort()

    # check that there is no gpu transfer in the inner loop.
    assert any(isinstance(node.op, GpuElemwise) for node in scan_node_topo)
    assert not any(isinstance(node.op, HostFromGpu) for node in scan_node_topo)
    assert not any(isinstance(node.op, GpuFromHost) for node in scan_node_topo)