def test_using_gpu_3(self):
    if theano.config.device.find('gpu') > -1:
        from theano import function, config, shared, sandbox, Out
        import theano.tensor as T
        import numpy
        import time

        vlen = 10 * 30 * 70  # 10 x # cores x # threads per core
        iters = 10

        rng = numpy.random.RandomState(22)
        x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
        f = function([],
                     Out(sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)),
                         borrow=True))
        t0 = time.time()
        for i in xrange(iters):
            r = f()
        print 'Looping %d times took' % iters, time.time() - t0, 'seconds'
        print 'Result is', r
        print 'Numpy result is', numpy.asarray(r)
        if numpy.any([isinstance(x.op, T.Elemwise)
                      for x in f.maker.env.toposort()]):
            print 'Used the cpu'
        else:
            print 'Used the gpu'
        assert not numpy.any([isinstance(x.op, T.Elemwise)
                              for x in f.maker.env.toposort()])
# Imports assumed by this snippet (old Theano API with theano.Out).
from __future__ import print_function
import time
import numpy as np
from theano import function, config, shared, tensor, Out

def benchmark_shared_cpu():
    vlen = 10 * 30 * 700
    iters = 1000

    rng = np.random.RandomState(22)
    x = shared(np.asarray(rng.rand(vlen), config.floatX))
    f1 = function([], tensor.exp(x))
    f2 = function([], Out(tensor.exp(x), borrow=True))

    t0 = time.time()
    for i in xrange(iters):
        r = f1()
    t1 = time.time()
    no_borrow = t1 - t0

    t0 = time.time()
    for i in xrange(iters):
        r = f2()
    t1 = time.time()

    print('Looping', iters, 'times took', no_borrow, 'seconds without borrow', end=' ')
    print('and', t1 - t0, 'seconds with borrow.')
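With borrow=True on the output, Theano is allowed to hand back its internal output buffer instead of copying it, which is why the second loop is usually faster; the trade-off is that the returned array may be overwritten by the next call to the function, so it must be copied before being kept around.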
from theano import function, In, Out

def makeFunc(inList, outList, updates):
    # Wrap every input and output so Theano may borrow the underlying buffers.
    inputs = []
    for i in inList:
        inputs.append(In(i, borrow=True, allow_downcast=True))
    outputs = []
    for o in outList:
        outputs.append(Out(o, borrow=True))
    return function(inputs=inputs, outputs=outputs, updates=updates,
                    allow_input_downcast=True)
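As a rough usage sketch (not from the original source), makeFunc could be called with symbolic variables like the following; the names a and acc and the update rule are illustrative only.

import numpy as np
import theano
import theano.tensor as T
from theano import shared

# Hypothetical example: square a matrix and accumulate the input into a
# shared variable, going through the borrowed In/Out wrappers above.
floatX = theano.config.floatX
a = T.matrix('a')
acc = shared(np.zeros((2, 2), dtype=floatX))
f = makeFunc([a], [a ** 2], [(acc, acc + a)])
print(f(np.ones((2, 2), dtype=floatX)))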
P0 = float32((1.0 / alpha) * eye(nRec2Out))
x = shared(x0)
r = shared(tanh(x0))
z = shared(z0)
P = shared(P0)
dts = shared(dt)
wo = shared(zeros((nRec2Out, 1), dtype=float32))
wf = shared(float32(2.0 * (random.rand(N, 1)) - 0.5))
fti = T.scalar('fti')
Ms = shared(M)

xnew = (1.0 - dts) * x + T.dot(Ms, r * dts) + wf * dts * z[0][0]
znew = T.dot(T.transpose(wo), r)
update = function([], [Out(z, borrow=True)],
                  updates=[(x, xnew), (r, T.tanh(x)), (z, znew)])
print "Update compiled"

k = T.dot(P, r)
rPr = T.dot(T.transpose(r), k)
c = 1.0 / (1.0 + rPr)
Pnew = P - T.dot(k, k.T) * c[0][0]
wonew = wo - (z[0][0] - fti) * k * c[0][0]
learn = function([fti], [Out(wo, borrow=True)],
                 updates=[(P, Pnew), (wo, wonew)])
print "Learn compiled"

amp = 1.3
freq = 1 / 60.0
ft = (amp / 1.0) * sin(1.0 * math.pi * freq * simtime) + \
from theano import function, config, shared, sandbox, Out
import theano.tensor as T
import numpy
import time

vlen = 10 * 30 * 768  # 10 x # cores x # threads per core
iters = 1000

rng = numpy.random.RandomState(22)
x = shared(numpy.asarray(rng.rand(vlen), config.floatX))
f = function([],
             Out(sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)),
                 borrow=True))
print f.maker.fgraph.toposort()
t0 = time.time()
for i in xrange(iters):
    r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'
print 'Result is', r
print 'Numpy result is', numpy.asarray(r)
if numpy.any([isinstance(x.op, T.Elemwise) for x in f.maker.fgraph.toposort()]):
    print 'Used the cpu'
else:
    print 'Used the gpu'

In this example, execution finishes in just over 0.05 seconds, more than a 60x speedup over the CPU implementation.

Setting the borrow flag to False:
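The code that originally followed this sentence is not included here; the listing below is a minimal sketch of what the borrow=False variant presumably looks like, simply reusing the graph and variables from the example above with Out(..., borrow=False).

# Same graph as above, but the output is copied out of Theano's internal
# buffer on every call instead of being borrowed.
f = function([],
             Out(sandbox.cuda.basic_ops.gpu_from_host(T.exp(x)),
                 borrow=False))
t0 = time.time()
for i in xrange(iters):
    r = f()
t1 = time.time()
print 'Looping %d times took' % iters, t1 - t0, 'seconds'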
def compileModel(data, nInputs, nOutputs, hiddenLayersSize=[1200, 1200],
                 dropoutRates=[0.2, 0.5, 0.5], activation='relu',
                 weightInitMode='normal', regularizer=0.0001):
    """
    Creates a symbolic model with the specified parameters using Theano.

    Output: A list containing the compiled Theano training, validation and
    test functions.
    """
    np.random.seed(815)

    x = T.matrix('x')
    y = T.wvector('y')
    learningRate = T.scalar('learningRate')
    regularization = T.scalar('regularization')

    # Data sets
    train_x, train_y = data[0]
    valid_x, valid_y = data[1]
    test_x, test_y = data[2]

    nnet = MLP(x, nInputs, hiddenLayersSize, nOutputs,
               dropoutRates=dropoutRates, activation=activation,
               weightInitMode=weightInitMode)

    loss = nnet.loss(y, regularization)
    error = nnet.error(y)
    gParams = T.grad(loss, nnet.params)
    weightUpdates = [(param, param - learningRate * gParam)
                     for param, gParam in zip(nnet.params, gParams)]

    batchIndicesVecctor = T.ivector('batchIndicesVecctor')
    trainF = function([batchIndicesVecctor, learningRate, regularization],
                      Out(sbasic.gpu_from_host(loss), borrow=True),
                      updates=weightUpdates,
                      givens={x: train_x[batchIndicesVecctor],
                              y: train_y[batchIndicesVecctor]})
    validF = function([batchIndicesVecctor],
                      Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)),
                          borrow=True),
                      givens={x: valid_x[batchIndicesVecctor],
                              y: valid_y[batchIndicesVecctor]})
    testF = function([batchIndicesVecctor],
                     Out(sbasic.gpu_from_host(T.cast(error, T.config.floatX)),
                         borrow=True),
                     givens={x: test_x[batchIndicesVecctor],
                             y: test_y[batchIndicesVecctor]})
    return [trainF, validF, testF]
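As a rough, hypothetical usage sketch (not from the original project): the returned functions take a vector of minibatch indices, and trainF additionally takes the learning rate and regularization strength. The driver loop below assumes data, nInputs, nOutputs, nTrainExamples and nValidExamples are prepared elsewhere.

# Hypothetical driver loop; all names outside compileModel are illustrative.
trainF, validF, testF = compileModel(data, nInputs, nOutputs)
batchSize = 128
for epoch in xrange(10):
    for start in xrange(0, nTrainExamples, batchSize):
        batchIndices = np.arange(start, min(start + batchSize, nTrainExamples),
                                 dtype='int32')
        trainLoss = trainF(batchIndices, 0.01, 0.0001)
    validError = validF(np.arange(nValidExamples, dtype='int32'))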
x = shared(x0)
r = shared(tanh(x0))
z = shared(z0)
P = shared(P0)
dts = shared(dt)
wo = shared(zeros((nRec2Out, 1), dtype=float32))
wf = shared(float32(2.0 * (random.rand(N, 1)) - 0.5))
fti = T.scalar('fti')
Ms = shared(M)
Mins = shared(Min)
I = T.matrix('I')

xnew = (1.0 - dts) * x + T.dot(Ms, r * dts) + T.dot(Mins, I)
znew = T.dot(T.transpose(wo), r)
update = function([I], [Out(z, borrow=True)],
                  updates=[(x, xnew), (r, T.tanh(x)), (z, znew)],
                  mode='PROFILE_MODE')
print "Update compiled"

k = T.dot(P, r)
rPr = T.dot(T.transpose(r), k)
c = 1.0 / (1.0 + rPr)
Pnew = P - T.dot(k, k.T) * c[0][0]
dw = (z[0][0] - fti) * k * c[0][0]
wonew = wo - dw
Mnew = Ms + T.tile(dw.T, (N, 1))
learn = function([fti], [Out(wo, borrow=True)],
                 updates=[(P, Pnew), (wo, wonew), (Ms, Mnew)],
                 mode='PROFILE_MODE')