def test_gemv_dot_strides():
    # Reported in https://github.com/Theano/Theano/issues/6142
    xv = rand(5)
    yv = rand(5, 1)
    x = gpuarray_shared_constructor(xv)
    y = gpuarray_shared_constructor(yv, broadcastable=(False, True))
    f = theano.function([], tensor.dot(x, y[::-1]), mode=mode_with_gpu)
    out = f()
    utt.assert_allclose(out, np.dot(xv, yv[::-1]))
def test_float16():
    # gemv (gemm called)
    float16_data = [
        rand(3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gemv(*float16_shared)
    f = theano.function([], o, mode=mode_with_gpu)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)
    topo = f.maker.fgraph.toposort()
    assert any([isinstance(n.op, GpuGemm) for n in topo])

    # gemm
    float16_data = [
        rand(3, 3).astype("float16"),
        np.asarray(1, dtype=np.float32),
        rand(3, 3).astype("float16"),
        rand(3, 3).astype("float16"),
        np.asarray(0.5, dtype=np.float32),
    ]
    float16_shared = [
        gpuarray_shared_constructor(val, target=test_ctx_name)
        for val in float16_data
    ]
    o = gpugemm_no_inplace(*float16_shared)
    f = theano.function([], o)
    y, alpha, A, x, beta = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), alpha * np.dot(A, x) + beta * y)

    # dot22
    float16_data = [rand(3, 3).astype("float16"), rand(3, 3).astype("float16")]
    float16_shared = [gpuarray_shared_constructor(val) for val in float16_data]
    o = gpu_dot22(*float16_shared)
    f = theano.function([], o)
    x, y = float16_data
    out = f()
    utt.assert_allclose(np.asarray(out), np.dot(x, y))
def test_leaf_lstm():
    from theano.gpuarray import gpuarray_shared_constructor
    from theano.gpuarray.tests.config import mode_with_gpu

    n_time = 5
    n_batch = 3
    n_input = 2
    n_output = 3
    xs_data = I.Normal()([n_time, n_batch, n_input])
    ts_data = I.Normal()([n_time, n_batch, n_output])
    h0_data = I.Normal()([n_batch, n_output])
    c0_data = I.Normal()([n_batch, n_output])
    xs = tt.tensor3("xs")
    ts = tt.tensor3("ts")
    h0 = tt.matrix("h0")
    c0 = tt.matrix("c0")
    givens = {xs: xs_data, ts: ts_data, h0: h0_data, c0: c0_data}

    def forward(fun):
        return theano.function([], fun(xs, h0, c0), givens=givens,
                               on_unused_input='ignore', mode=mode_with_gpu)()

    lstm = L.LSTM(n_input, n_output, impl=L.RNNImpl.ref)
    ref_lstm = forward(lstm.ref_forward)
    fused_lstm = forward(lstm.fused_forward)
    for r, f in zip(ref_lstm, fused_lstm):
        numpy.testing.assert_array_almost_equal(r, f)

    def backward(fun, params):
        ys = fun(xs, h0, c0)[0]
        cost = tt.mean((ts - ys) ** 2)
        grad = tt.grad(cost, [xs, h0, c0] + params)
        return theano.function([], grad, givens=givens,
                               on_unused_input='ignore', mode=mode_with_gpu)()

    fused_grad = backward(lstm.fused_forward, lstm.params)
    ref_grad = backward(lstm.ref_forward, lstm.params)
    for r, f in zip(ref_grad, fused_grad):
        numpy.testing.assert_array_almost_equal(r, f)

    lstm.impl = L.RNNImpl.cudnn  # TODO: do this in cudnn_forward
    cudnn_lstm = forward(lstm.cudnn_forward)
    for r, c in zip(ref_lstm, cudnn_lstm):
        numpy.testing.assert_array_almost_equal(r, c, decimal=cudnn_decimal)
    cudnn_grad = backward(lstm.cudnn_forward, lstm.params)
    cudnn_grad = cudnn_grad[:3] + lstm._rnn_block.split_params(
        gpuarray_shared_constructor(cudnn_grad[3]), 0, [n_batch, n_input])
    for r, c in zip(ref_grad, cudnn_grad):
        numpy.testing.assert_array_almost_equal(r, c, decimal=cudnn_decimal)
def test_gpu_cholesky_inplace(self):
    A = self.rand_symmetric(1000)
    A_gpu = gpuarray_shared_constructor(A)
    A_copy = A_gpu.get_value()
    C = GpuMagmaCholesky()(A_gpu)
    fn = theano.function([], C, mode=mode_with_gpu, updates=[(A_gpu, C)])
    assert any([
        node.op.inplace
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaCholesky)
    ])
    # Running the function overwrites A_gpu with its Cholesky factor in place.
    fn()
    L = A_gpu.get_value()
    utt.assert_allclose(np.dot(L, L.T), A_copy, atol=1e-3)
def test_gpu_matrix_inverse_inplace(self):
    N = 1000
    test_rng = np.random.RandomState(seed=1)
    A_val_gpu = gpuarray_shared_constructor(
        test_rng.rand(N, N).astype("float32") * 2 - 1)
    A_val_copy = A_val_gpu.get_value()
    A_val_gpu_inv = GpuMagmaMatrixInverse()(A_val_gpu)
    fn = theano.function([], A_val_gpu_inv, mode=mode_with_gpu,
                         updates=[(A_val_gpu, A_val_gpu_inv)])
    assert any([
        node.op.inplace
        for node in fn.maker.fgraph.toposort()
        if isinstance(node.op, GpuMagmaMatrixInverse)
    ])
    fn()
    utt.assert_allclose(np.eye(N),
                        np.dot(A_val_gpu.get_value(), A_val_copy), atol=5e-3)
def shared(val):
    # Prefer a GPU shared variable; fall back to a regular (CPU) shared
    # variable when the value cannot be placed on the GPU.
    try:
        return gpuarray_shared_constructor(val)
    except TypeError:
        return theano.shared(val)
def test_leaf_gru():
    from theano.gpuarray import gpuarray_shared_constructor
    from theano.gpuarray.tests.config import mode_with_gpu

    n_time = 5
    n_batch = 3
    n_input = 2
    n_output = 3
    xs_data = I.Normal()([n_time, n_batch, n_input])
    h0_data = I.Normal()([n_batch, n_output])
    ts_data = I.Normal()([n_time, n_batch, n_output])
    xs = tt.tensor3("xs")
    h0 = tt.matrix("h0")
    ts = tt.tensor3("ts")
    givens = {xs: xs_data, h0: h0_data, ts: ts_data}

    def forward(fun):
        return theano.function([], fun(xs, h0), givens=givens,
                               on_unused_input='ignore', mode=mode_with_gpu)()

    # NOTE: n_batch won't affect rnnblock (!?)
    gru = L.GRU(n_input, n_output, n_batch=1, impl=L.RNNImpl.auto)
    assert gru.impl != L.RNNImpl.auto
    gru.impl = L.RNNImpl.ref
    ref_ys = forward(gru.ref_forward)
    fused_ys = forward(gru.fused_forward)
    for r, f in zip(ref_ys, fused_ys):
        numpy.testing.assert_array_almost_equal(r, f)

    def backward(fun, params):
        ys = fun(xs, h0)[0]
        cost = tt.mean((ts - ys) ** 2)
        grad = tt.grad(cost, [xs, h0] + params)
        return theano.function([], grad, givens=givens,
                               on_unused_input='ignore', mode=mode_with_gpu)()

    ref_grad = backward(gru.ref_forward, gru.params)
    fused_grad = backward(gru.fused_forward, gru.params)
    for r, f in zip(ref_grad, fused_grad):
        numpy.testing.assert_array_almost_equal(r, f)

    gru.impl = L.RNNImpl.cudnn
    assert gru.params == list(gru.get_params())
    cudnn_ys = forward(gru.cudnn_forward)
    for r, c in zip(ref_ys, cudnn_ys):
        numpy.testing.assert_array_almost_equal(r, c, decimal=cudnn_decimal)
    cudnn_grad = backward(gru.cudnn_forward, gru.params)
    # The cuDNN weight gradient comes back as one flat buffer; split it back
    # into per-parameter pieces before comparing against the reference grads.
    cudnn_grad = cudnn_grad[:2] + gru._rnn_block.split_params(
        gpuarray_shared_constructor(cudnn_grad[2]), 0, [n_batch, n_input])
    for r, f in zip(ref_grad, cudnn_grad):
        numpy.testing.assert_array_almost_equal(r, f, decimal=cudnn_decimal)

    gru.impl = L.RNNImpl.fused
    fused_ys = forward(gru.fused_forward)
    for r, f in zip(ref_ys, fused_ys):
        numpy.testing.assert_array_almost_equal(r, f)