def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
    """Check the GPU 3D-correlation input gradient against the CPU op.

    Both shapes are permuted with the axis order ``(0, 4, 1, 2, 3)``,
    random values are uploaded as GPU shared variables, and the result of
    ``GpuCorr3dMM_gradInputs`` is compared with ``Corr3dMMGradInputs``.
    An explicit output shape is only passed to the ops when ``subsample``
    is not exactly ``(1, 1, 1)``.
    """
    axis_order = (0, 4, 1, 2, 3)
    inputs_shape = [inputs_shape[axis] for axis in axis_order]
    filters_shape = [filters_shape[axis] for axis in axis_order]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)
    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    # Size of each of the three trailing axes of the gradient output:
    # (top_size - 1) * stride + filter_size.
    bottom_dims = [
        (inputs_shape[axis] - 1) * subsample[axis - 2] + filters_shape[axis]
        for axis in (2, 3, 4)
    ]
    bottom_shape = gpuarray_shared_constructor(np.array(bottom_dims))

    # The explicit shape is handed over only for non-unit strides.
    shape_kwargs = {} if subsample == (1, 1, 1) else {"shape": bottom_shape}

    conv_ref = Corr3dMMGradInputs(subsample=subsample)(
        kern=ref_cast(filters), topgrad=ref_cast(inputs), **shape_kwargs
    )
    conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
        kern=filters, topgrad=inputs, **shape_kwargs
    )

    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    expected = f_ref()
    actual = f()
    utt.assert_allclose(expected, actual)
def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
    """Check the GPU 2D-correlation weight gradient against the CPU op.

    All three shapes are permuted with the axis order ``(0, 3, 1, 2)``.
    Random inputs and output gradients are uploaded as GPU shared
    variables and ``GpuCorrMM_gradWeights`` is compared with
    ``CorrMM_gradWeights``.  The filter shape is only passed to the ops
    when ``subsample`` is not exactly ``(1, 1)``.
    """
    axis_order = (0, 3, 1, 2)
    inputs_shape = [inputs_shape[axis] for axis in axis_order]
    filters_shape = [filters_shape[axis] for axis in axis_order]
    dCdH_shape = [dCdH_shape[axis] for axis in axis_order]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
    inputs = gpuarray_shared_constructor(inputs_val)
    dCdH = gpuarray_shared_constructor(dCdH_val)

    # The two trailing entries of the permuted filter shape.
    shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))

    # The explicit shape is handed over only for non-unit strides.
    shape_kwargs = {} if subsample == (1, 1) else {"shape": shape}

    conv_ref = CorrMM_gradWeights(subsample=subsample)(
        ref_cast(inputs), ref_cast(dCdH), **shape_kwargs
    )
    conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
        inputs, dCdH, **shape_kwargs
    )

    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
    f = theano.function([], conv_gemm, mode=mode_with_gpu)

    expected = f_ref()
    actual = f()
    utt.assert_allclose(expected, actual)
def test_gpuarray_shared_scalar():
    """A 0-d value is only accepted when the target is given explicitly."""
    scalar = np.asarray(1, dtype="float32")

    # By default, we don't put scalar as shared variable on the GPU
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(scalar)

    # But we can force that
    gpuarray_shared_constructor(scalar, target=test_ctx_name)
def run_conv_valid(
    self,
    inputs_shape,
    filters_shape,
    border_mode="valid",
    filter_dilation=(1, 1),
    subsample=(1, 1),
    unshared=False,
    verify_grad=False,
):
    """Check ``GpuCorrMM`` against the CPU ``CorrMM`` op.

    The input shape is permuted with ``(0, 3, 1, 2)``; the filter shape
    with ``(0, 1, 2, 5, 3, 4)`` when ``unshared`` is true and with
    ``(0, 3, 1, 2)`` otherwise.  Random values are uploaded as GPU shared
    variables, both ops are compiled and their outputs compared.  With
    ``verify_grad`` set, the gradient of a fresh ``GpuCorrMM`` instance is
    additionally checked through ``utt.verify_grad``.
    """
    filter_axes = (0, 1, 2, 5, 3, 4) if unshared else (0, 3, 1, 2)
    inputs_shape = [inputs_shape[axis] for axis in (0, 3, 1, 2)]
    filters_shape = [filters_shape[axis] for axis in filter_axes]

    inputs_val = np.random.random(inputs_shape).astype(config.floatX)
    filters_val = np.random.random(filters_shape).astype(config.floatX)
    inputs = gpuarray_shared_constructor(inputs_val)
    filters = gpuarray_shared_constructor(filters_val)

    # Identical configuration for the reference op and the GPU op.
    op_kwargs = dict(
        border_mode=border_mode,
        filter_dilation=filter_dilation,
        subsample=subsample,
        unshared=unshared,
    )

    conv_ref = CorrMM(**op_kwargs)(ref_cast(inputs), ref_cast(filters))
    f_ref = theano.function([], conv_ref, mode=mode_without_gpu)

    conv = GpuCorrMM(**op_kwargs)(inputs, filters)
    f = theano.function([], conv, mode=mode_with_gpu)

    expected = f_ref()
    actual = f()
    utt.assert_allclose(expected, actual)

    if not verify_grad:
        return
    utt.verify_grad(
        GpuCorrMM(**op_kwargs),
        [inputs_val, filters_val],
        mode=mode_with_gpu,
    )
def test_overflow_gpu_new_backend():
    """Exercise size-overflow handling of ``GPUA_mrg_uniform.new``.

    Builds a 7-substream generator state on the GPU and runs
    ``rng_mrg_overflow`` with sizes that must raise, sizes that must not,
    and sizes expressed as ``np.int32`` values.
    """
    seed = 12345
    n_substreams = 7

    # First substream state, then each following one derived with ff_2p72.
    substream_states = [np.array([seed] * 6, dtype="int32")]
    while len(substream_states) < n_substreams:
        substream_states.append(rng_mrg.ff_2p72(substream_states[-1]))
    rstate = gpuarray_shared_constructor(np.asarray(substream_states))

    fct = functools.partial(GPUA_mrg_uniform.new, rstate, ndim=None, dtype="float32")

    cases = (
        # should raise error as the size overflows
        ([(2**31,), (2**32,), (2**15, 2**16), (2, 2**15, 2**15)], True),
        # should not raise error
        ([(2**5,), (2**5, 2**5), (2**5, 2**5, 2**5)], False),
        # should support int32 sizes
        (
            [
                (np.int32(2**10),),
                (np.int32(2), np.int32(2**10), np.int32(2**10)),
            ],
            False,
        ),
    )
    for sizes, must_raise in cases:
        rng_mrg_overflow(sizes, fct, mode, should_raise_error=must_raise)
def test_overflow_gpu_new_backend():
    """Exercise size-overflow handling of ``rng_mrg.GPUA_mrg_uniform.new``.

    Builds a 7-substream generator state on the GPU and runs
    ``rng_mrg_overflow`` with sizes that must raise, sizes that must not,
    and sizes expressed as ``numpy.int32`` values.
    """
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.gpuarray.tests.test_basic_ops import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor

    seed = 12345
    n_substreams = 7

    # First substream state, then each following one derived with ff_2p72.
    substream_states = [numpy.array([seed] * 6, dtype='int32')]
    while len(substream_states) < n_substreams:
        substream_states.append(rng_mrg.ff_2p72(substream_states[-1]))
    rstate = gpuarray_shared_constructor(numpy.asarray(substream_states))

    fct = functools.partial(
        rng_mrg.GPUA_mrg_uniform.new, rstate, ndim=None, dtype='float32'
    )

    cases = (
        # should raise error as the size overflows
        ([(2**31,), (2**32,), (2**15, 2**16), (2, 2**15, 2**15)], True),
        # should not raise error
        ([(2**5,), (2**5, 2**5), (2**5, 2**5, 2**5)], False),
        # should support int32 sizes
        (
            [
                (numpy.int32(2**10),),
                (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10)),
            ],
            False,
        ),
    )
    for sizes, must_raise in cases:
        rng_mrg_overflow(sizes, fct, mode, should_raise_error=must_raise)
def test_validate_input_types_gpuarray_backend():
    """Call ``mrg_uniform.new`` on a GPU state with test values enforced.

    The test passes when building the node under
    ``compute_test_value="raise"`` does not raise.
    """
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags

    with change_flags(compute_test_value="raise"):
        gpu_state = gpuarray_shared_constructor(np.zeros((7, 6), dtype="int32"))
        mrg_uniform.new(gpu_state, ndim=None, dtype="float32", size=(3,))
def test_validate_input_types_gpuarray_backend():
    """Call ``mrg_uniform.new`` on a GPU state with test values enforced.

    The test passes when building the node under
    ``compute_test_value="raise"`` does not raise.
    """
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags

    with change_flags(compute_test_value="raise"):
        gpu_state = gpuarray_shared_constructor(numpy.zeros((7, 6), dtype="int32"))
        mrg_uniform.new(gpu_state, ndim=None, dtype="float32", size=(3,))
def test_set_value_non_contiguous():
    """``set_value`` must work after a non-contiguous value was borrowed in."""
    initial = np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]])
    shared_var = gpuarray_shared_constructor(initial)

    # Store every other row of the internal value back, by reference.
    strided_view = shared_var.get_value(borrow=True, return_internal_type=True)[::2]
    shared_var.set_value(strided_view, borrow=True)

    internal = shared_var.get_value(borrow=True, return_internal_type=True)
    assert not internal.flags["C_CONTIGUOUS"]

    # In the past, this failed
    shared_var.set_value([[0, 0], [1, 1]])
def test_incsub_offset():
    """Regression test for https://github.com/Theano/Theano/issues/5670."""
    floatX = theano.config.floatX

    # Build a GPU variable which value will have an offset (x1)
    x = gpuarray_shared_constructor(np.zeros(5, dtype=floatX))
    x1 = x[1:]

    # Use inc_subtensor on it
    y = tensor.vector()
    z = tensor.inc_subtensor(x1[2:], y)

    # Use updates so that inc_subtensor can happen inplace
    f = theano.function([y], z, updates={x: z}, mode=mode_with_gpu)

    expected = np.array([0, 0, 1, 2], dtype=floatX)
    utt.assert_allclose(f([1, 2]), expected)
def test_consistency_GPUA_parallel():
    """
    Verify that the random numbers generated by GPUA_mrg_uniform, in
    parallel, are the same as the reference (Java) implementation by
    L'Ecuyer et al.
    """
    from theano.gpuarray.tests.test_basic_ops import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = numpy.array([seed] * 6, dtype='int32')

    for _stream in range(n_streams):
        # One state row per substream, each derived from the previous one.
        substream_states = [curr_rstate.copy()]
        while len(substream_states) < n_substreams:
            substream_states.append(rng_mrg.ff_2p72(substream_states[-1]))
        rstate = gpuarray_shared_constructor(numpy.asarray(substream_states))

        new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(
            rstate, ndim=None, dtype='float32', size=(n_substreams,)
        )
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)

        stream_samples = [f() for _draw in range(n_samples)]
        samples.append(numpy.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = numpy.array(samples).flatten()
    assert numpy.allclose(samples, java_samples)
def __init__(self, input_size, hidden_size, dtype=theano.config.floatX):
    """Create a single-layer cuDNN GRU block and its flat parameter buffer.

    Parameters
    ----------
    input_size
        Stored on the instance and used as the second entry of the
        ``(1, input_size)`` shape passed to ``get_param_size``.
    hidden_size
        Stored on the instance and forwarded to ``dnn.RNNBlock``.
    dtype
        Data type forwarded to ``dnn.RNNBlock`` and used for the
        zero-initialised parameter buffer.
    """
    self.grub = dnn.RNNBlock(
        dtype=dtype,
        hidden_size=hidden_size,
        num_layers=1,
        rnn_mode='gru',
    )
    self.input_size = input_size
    self.hidden_size = hidden_size

    psize = self.grub.get_param_size((1, input_size))
    # Allocate with the block's own dtype: previously this always used
    # theano.config.floatX, which disagrees with the RNNBlock whenever a
    # different ``dtype`` is passed in.
    self.params = gpuarray_shared_constructor(np.zeros(psize, dtype=dtype))
def __init__(self, rng, n_hidden, x, E, xmask, is_train, dropout, mode='lstm', n_layer=1, pre_state=None, **kwargs):
    """Build a unidirectional cuDNN RNN over embedded inputs.

    ``x`` is flattened to index rows of ``E``; the result is reshaped to
    ``[x.shape[0], x.shape[1], -1]`` and fed to a ``dnn.RNNBlock``.  The
    block output is multiplied by ``xmask`` and, when ``dropout > 0``,
    passed through a train/test dropout switch.

    Parameters
    ----------
    rng
        Random stream; its ``binomial`` method is used for the dropout mask.
    n_hidden
        Hidden size of the RNN block.
    x
        Symbolic index variable; presumably (time, batch) -- TODO confirm.
    E
        Embedding table indexed with the flattened ``x``.
    xmask
        Mask broadcast over the last axis of the RNN output.
    is_train
        Symbolic switch selecting the training branch of dropout.
    dropout
        Drop probability; values ``<= 0`` disable dropout.
    mode
        ``rnn_mode`` of the block; ``'lstm'`` adds a cell state.
    n_layer
        Number of stacked layers.
    pre_state
        Optional list ``[h0]`` or ``[h0, c0]``; zeros are used when ``None``.
    **kwargs
        Accepted and ignored.
    """
    self.is_train = is_train
    self.dropout = dropout
    self.rng = rng
    self.xmask = xmask

    shape = x.shape
    embd = E[x.flatten()]
    embd = embd.reshape([shape[0], shape[1], -1])

    # Identity test instead of ``== None``.
    if pre_state is None:
        h0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
        pre_state = [h0, ]
        if mode == 'lstm':
            c0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
            pre_state.append(c0)

    rnnb = dnn.RNNBlock(
        dtype=theano.config.floatX,
        hidden_size=n_hidden,
        num_layers=n_layer,
        rnn_mode=mode,
        input_mode='skip',
        direction_mode='unidirectional',
    )
    psize = rnnb.get_param_size([1, n_hidden])
    # Call form of print: valid on Python 2 and 3 alike (the statement
    # form was a syntax error on Python 3).
    print(psize)
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize,), dtype=theano.config.floatX)
    )
    # Alternative (disabled) uniform initialisation:
    #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
    #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
    #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
    self.params = [params_cudnn, ]

    if mode == 'lstm':
        h = rnnb.apply(params_cudnn, embd, pre_state[0], pre_state[1])[0]
    else:
        h = rnnb.apply(params_cudnn, embd, pre_state[0])[0]

    h = h * self.xmask.dimshuffle(0, 1, 'x')

    # Dropout
    if self.dropout > 0:
        drop_mask = self.rng.binomial(
            n=1, p=1 - self.dropout, size=h.shape, dtype=theano.config.floatX
        )
        self.activation = T.switch(
            self.is_train, h * drop_mask, h * (1 - self.dropout)
        )
    else:
        self.activation = T.switch(self.is_train, h, h)
def test_consistency_GPUA_serial():
    """Compare serially drawn GPUA_mrg_uniform samples with the reference.

    Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    are the same as the reference (Java) implementation by L'Ecuyer et al.
    """
    from theano.gpuarray.tests.config import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    curr_rstate = numpy.array([seed] * 6, dtype='int32')

    for _stream in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for _substream in range(n_substreams):
            substream_rstate = numpy.array([stream_rstate.copy()], dtype='int32')
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(
                rstate, ndim=None, dtype='float32', size=(1,)
            )
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)
            samples.extend([f() for _draw in range(n_samples)])

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = numpy.array(samples).flatten()
    assert numpy.allclose(samples, java_samples)
def test_blocksparse_grad_merge(self):
    """Check that merging the learning-rate update keeps results identical.

    Compiles the same ``W`` update twice: once with the default GPU mode,
    where the output op is expected to be ``GpuSparseBlockOuter``, and once
    with the two ``local_merge_blocksparse_*`` optimizations excluded,
    where it must not be.  Both functions are then run from the same
    starting ``W`` and the resulting values compared.
    """
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
    # ``target`` is the keyword used for the context everywhere else in
    # this file; the previous ``context=`` keyword did not match it.
    W = gpuarray_shared_constructor(W_val, target=test_ctx_name)

    o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = theano.grad(o.sum(), W)

    lr = np.asarray(0.05, dtype="float32")

    upd = W - lr * gW

    f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode_with_gpu)

    # Make sure the lr update was merged.
    assert isinstance(f1.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

    # Exclude the merge optimizations.
    mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
    mode = mode.excluding("local_merge_blocksparse_output")

    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

    # Make sure the lr update is not merged.
    assert not isinstance(f2.maker.fgraph.outputs[0].owner.op, GpuSparseBlockOuter)

    f2(h_val, iIdx_val, b_val, oIdx_val)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    f1(h_val, iIdx_val, b_val, oIdx_val)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)
def test_Gpujoin_inplace():
    """Test Gpujoin to work inplace.

    This function tests the case when several elements are passed to the
    Gpujoin function but all except one of them are empty. In this case
    Gpujoin should work inplace and the output should be the view of the
    non-empty element.
    """
    s = tt.lscalar()
    x = gpuarray_shared_constructor(
        np.array([3, 4, 5], dtype=theano.config.floatX), borrow=True
    )
    z = tt.zeros((s,))

    c = GpuJoin(view=0)(0, x, z)
    f = theano.function([s], theano.Out(c, borrow=True))

    # The identity check is skipped when the GPU mode is a DebugMode.
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        internal = x.get_value(borrow=True, return_internal_type=True)
        joined = f(0)
        assert internal is joined

    assert np.allclose(f(0), [3, 4, 5])
def _params_to_cudnn(self):
    """Create a cuDNN RNN block and copy the existing parameters into it.

    Sets ``self._rnn_block`` and replaces ``self.params`` with a one-element
    list holding the flat cuDNN parameter buffer.  The buffer is split for
    layer 0 and each piece is filled from the matching entry of
    ``self.non_cudnn_params``.
    """
    from theano.gpuarray import dnn
    from theano.gpuarray.type import gpuarray_shared_constructor

    assert dnn.dnn_available(None)

    block = dnn.RNNBlock(
        theano.config.floatX,
        self.hidden_dim,
        num_layers=1,
        input_mode="linear",
        rnn_mode=self.rnn_type,
        direction_mode="unidirectional",
    )
    self._rnn_block = block

    # TODO: study about n_batch
    param_size = block.get_param_size([self.n_batch, self.input_dim])

    flat_params = gpuarray_shared_constructor(Constant(0.0)(param_size))
    self.params = [flat_params]

    # TODO: multi layer support
    cudnn_pieces = block.split_params(
        flat_params, layer=0, input_size=[self.n_batch, self.input_dim]
    )
    for piece, source_param in zip(cudnn_pieces, self.non_cudnn_params):
        piece[:] = source_param.get_value(borrow=True, return_internal_type=True)
def test_cpu_target_with_shared_variable():
    """``target="cpu"`` must keep ``GPUA_mrg_uniform`` out of the graph."""
    srng = MRG_RandomStream()
    host_values = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(host_values, name="x")

    try:
        # To have theano.shared(x) try to move on the GPU
        theano.compile.shared_constructor(gpuarray_shared_constructor)

        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"
        z = (x * y).sum()
        z.name = "z"

        fz = theano.function([], z, mode=mode)

        for node in fz.maker.fgraph.toposort():
            assert not isinstance(node.op, GPUA_mrg_uniform)
    finally:
        # Always unregister the constructor again.
        theano.compile.shared_constructor(gpuarray_shared_constructor, remove=True)
def test_GPUA_full_fill():
    """Make sure the whole sample buffer is filled.

    Also make sure large samples are consistent with CPU results.
    """
    import theano.gpuarray.tests.config
    from theano.gpuarray.type import gpuarray_shared_constructor

    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    cpu_stream = MRG_RandomStreams(234, use_cuda=False)
    uni = cpu_stream.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)

    # Start the GPU generator from the state of the last CPU state update.
    initial_state = cpu_stream.state_updates[-1][0].get_value()
    rstate_gpu = gpuarray_shared_constructor(initial_state)
    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(
        rstate_gpu, ndim=None, dtype='float32', size=size
    )
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], sample)

    cpu_values = f_cpu()
    gpu_values = f_gpu()
    utt.assert_allclose(cpu_values, gpu_values)
def test_elemwise_pow():
    """Test that GpuElemwise(pow) compiles for every dtype combination.

    Every pairing of the listed integer and float dtypes is used for the
    base and the exponent; the compiled result is compared with NumPy.
    """
    dtypes = [
        "uint8", "uint16", "uint32", "uint64",
        "int8", "int16", "int32", "int64",
        "float16", "float32", "float64",
    ]

    for dtype_base in dtypes:
        for dtype_exp in dtypes:
            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = theano.tensor.vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp

            output = base**exp
            f = theano.function([base], output, mode=mode_with_gpu)

            # We don't transfer to the GPU when the output dtype is int*
            gpu_elemwise_count = sum(
                1
                for node in f.maker.fgraph.apply_nodes
                if isinstance(node.op, GpuElemwise)
            )
            assert gpu_elemwise_count == (output.dtype in tensor.float_dtypes)

            # Call the function to make sure the output is valid
            out = f(base_val)
            expected_out = base_val**exp_val
            assert_allclose(out, expected_out)
def __init__(self, rng, n_hidden, x, xmask, is_train, dropout, mode='gru', n_layer=1, pre_state=None, **kwargs):
    """Build a bidirectional cuDNN RNN over ``x``.

    The block output, multiplied by ``xmask``, is stored as
    ``self.context``.  ``self.activation`` ends up as the dropout switch
    applied to that masked output.

    Parameters
    ----------
    rng
        Random stream; its ``binomial`` method is used for the dropout mask.
    n_hidden
        Hidden size of the RNN block.
    x
        Symbolic input fed directly to the block (``input_mode='skip'``).
    xmask
        Mask broadcast over the last axis of the RNN output.
    is_train
        Symbolic switch selecting the training branch of dropout.
    dropout
        Drop probability; values ``<= 0`` disable dropout.
    mode
        ``rnn_mode`` of the block; ``'lstm'`` adds a cell state.
    n_layer
        Number of stacked layers.
    pre_state
        Optional list ``[h0]`` or ``[h0, c0]``; zeros are used when ``None``.
    **kwargs
        Accepted and ignored.
    """
    prefix = "BiGRU_"
    Wc = norm_weight(n_hidden * 2, n_hidden, name=prefix + 'Wc')
    bc = zero_bias(n_hidden, prefix + 'bc')

    self.is_train = is_train
    self.dropout = dropout
    self.rng = rng
    self.xmask = xmask

    # Identity test instead of ``== None``.
    if pre_state is None:
        h0 = T.zeros((n_layer, x.shape[1], n_hidden), dtype=theano.config.floatX)
        pre_state = [h0, ]
        if mode == 'lstm':
            c0 = T.zeros((n_layer, x.shape[1], n_hidden), dtype=theano.config.floatX)
            pre_state.append(c0)

    rnnb = dnn.RNNBlock(
        dtype=theano.config.floatX,
        hidden_size=n_hidden,
        num_layers=n_layer,
        rnn_mode=mode,
        input_mode='skip',
        direction_mode='bidirectional',
    )
    psize = rnnb.get_param_size([1, n_hidden])
    # Call form of print: valid on Python 2 and 3 alike (the statement
    # form was a syntax error on Python 3).
    print(psize)
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize,), dtype=theano.config.floatX)
    )
    # Alternative (disabled) uniform initialisation:
    #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
    #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
    #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
    self.params = [params_cudnn, ]

    if mode == 'lstm':
        h = rnnb.apply(params_cudnn, x, pre_state[0], pre_state[1])[0]
    else:
        h = rnnb.apply(params_cudnn, x, pre_state[0])[0]

    h = h * self.xmask.dimshuffle(0, 1, 'x')
    self.context = h

    # Masked mean over axis 0 of the block output.
    ctx_mean = (h * self.xmask[:, :, None]).sum(0) / self.xmask.sum(0)[:, None]

    # NOTE(review): this tanh projection is unconditionally overwritten by
    # the dropout section below, and Wc/bc are not added to self.params --
    # confirm whether that is intended before changing it.
    self.activation = T.tanh(T.dot(ctx_mean, Wc) + bc)

    # Dropout
    if self.dropout > 0:
        drop_mask = self.rng.binomial(
            n=1, p=1 - self.dropout, size=h.shape, dtype=theano.config.floatX
        )
        self.activation = T.switch(
            self.is_train, h * drop_mask, h * (1 - self.dropout)
        )
    else:
        self.activation = T.switch(self.is_train, h, h)
def shared(x, **kwargs):
    """Create a GPU shared variable for ``x`` on ``test_ctx_name``.

    Any extra keyword arguments are forwarded unchanged to
    ``gpuarray_shared_constructor``.
    """
    gpu_variable = gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)
    return gpu_variable
def __init__(self, num_layers=1, direction=0, **kwargs):
    """Build a cuDNN LSTM block layer with a linear output and NLL cost.

    The source outputs are concatenated along axis 2, run through a
    ``dnn.RNNBlock`` in LSTM mode, projected with ``W_out``/``b_out`` and
    scored with ``crossentropy_softmax_1hot`` against the target selected
    by ``self.attrs['target']``.  When the resolved device is ``'cpu'``
    only the parameters are registered and cost/error are constant zero.

    Parameters
    ----------
    num_layers
        Number of stacked layers of the RNN block.
    direction
        ``0`` selects ``'bidirectional'``; any other value selects
        ``'unidirectional'`` and is also used as the slicing step on the
        first axis of the input (``[::direction or 1]``).
    **kwargs
        Forwarded to the parent constructor after ``'device'`` has been
        filled in.
    """
    # this has to be provided in THEANO_FLAGS as e.g. contexts=gpu0->cuda0
    context_name = kwargs.get('device', str(theano.config.device))
    #if context_name == 'cpu':
    #  context_name = 'gpu0'
    kwargs['device'] = context_name
    #kwargs['n_out'] *= 2
    super(RNNBlockLayer, self).__init__(**kwargs)
    self.params = {}
    #self.attrs['n_out'] /= 2
    #self.set_attr('nout', self.attrs['n_out'] / 4)
    from theano.gpuarray import dnn
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.tensor.extra_ops import cpu_contiguous
    #from theano.sandbox.cuda.basic_ops import gpu_contiguous
    # On 'cpu' the block is still constructed, with 'gpu0' as its context.
    rnnb = dnn.RNNBlock(
        dtype=theano.config.floatX,
        hidden_size=self.attrs['n_out'],
        num_layers=num_layers,
        rnn_mode='lstm',
        input_mode='linear',
        direction_mode='unidirectional' if direction != 0 else 'bidirectional',
        context_name=context_name if context_name != 'cpu' else 'gpu0')
    buffer_size = 1  # self.attrs['n_out'] * num_layers
    #X = self.get_linear_forward_output()
    #X = T.concatenate([s.output for s in self.sources],axis=2)[::direction or 1]
    # Concatenate all source outputs on axis 2; a negative direction
    # reverses the first axis through the slice step.
    X = cpu_contiguous(
        T.concatenate([s.output for s in self.sources], axis=2)[::direction or 1])
    #X = cpu_contiguous(self.sources[0].output[::direction or 1])
    #X = T.concatenate([X,T.zeros((X.shape[0],batch_size - X.shape[1] + 1,X.shape[2]),X.dtype)],axis=1)[:,:-1]
    n_in = sum([s.attrs['n_out'] for s in self.sources])
    psize = rnnb.get_param_size([buffer_size, n_in])
    # Uniform initialisation range for the flat cuDNN parameter vector.
    l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
    pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l, size=(psize, )),
                           dtype=theano.config.floatX)
    if context_name == 'cpu':
        # No GPU context: register a bias-style parameter of the same size
        # instead of a GPU shared variable.
        params_cudnn = self.add_param(
            self.create_bias(psize, name='cudnn_%s' % self.name))
    else:
        params_cudnn = self.add_param(
            gpuarray_shared_constructor(pvalue,
                                        target=context_name,
                                        name='cudnn_%s' % self.name))
    # Zero initial cell and hidden states, sized from X.shape[1].
    c_init = cpu_contiguous(
        T.alloc(numpy.cast[theano.config.floatX](0), num_layers, X.shape[1],
                self.attrs['n_out']))
    h_init = cpu_contiguous(
        T.alloc(numpy.cast[theano.config.floatX](0), num_layers, X.shape[1],
                self.attrs['n_out']))
    W_out = self.add_param(
        self.create_random_uniform_weights(
            self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
    b_out = self.add_param(
        self.create_bias(self.y_in[self.attrs['target']].n_out))

    if context_name == 'cpu':
        # Parameters are registered above; skip building the cuDNN graph.
        self.cost_val = T.constant(0)
        self.error_val = T.constant(0)
        self.known_grads = {}
        return

    out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
    # NOTE(review): the first axis is reversed unconditionally here,
    # independent of ``direction`` -- confirm this is intended.
    out = out[::-1]
    out = T.dot(out, W_out) + b_out
    # Merge the first two axes so every row is one prediction.
    self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))

    # Rows whose flattened index entry is > 0.
    # NOTE(review): self.index is read here before it is reassigned below;
    # presumably the parent constructor sets it -- verify.
    self.i = (self.index.flatten() > 0).nonzero()
    self.y_data_flat = self.y_in[self.attrs['target']].flatten()
    nll, _ = T.nnet.crossentropy_softmax_1hot(
        x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
    self.cost_val = T.sum(nll)

    #self.cost_val = -T.sum(T.log(out[:,self.y_in[self.attrs['target']].flatten()][(self.index.flatten()>0).nonzero()]))
    self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
    self.output = out
    self.index = self.sources[0].index

    # Number of selected rows whose argmax differs from the target.
    self.error_val = T.sum(
        T.neq(T.argmax(self.y_m[self.i], axis=-1), self.y_data_flat[self.i]))
def test_dnn_rnn_lstm():
    """Compare a cuDNN LSTM block with a stack of reference LSTM layers.

    The reference layers' parameters are copied into the flat cuDNN
    parameter buffer; outputs and gradients (with respect to the input,
    both initial states and the parameters) must then agree.
    """
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)

    utt.seed_rng()

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    floatX = theano.config.floatX

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(np.zeros((psize,), dtype=floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for layer_idx in range(depth):
        lstm = LSTM(
            last_dim,
            hidden_dim,
            last_layer,
            s0=h0[layer_idx, :, :],
            c0=c0[layer_idx, :, :],
        )
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim

        # Copy this reference layer's parameters into the cuDNN buffer.
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(
            params_cudnn, layer_idx, [batch_size, input_dim]
        )
        for param_idx, piece in enumerate(dnn_params):
            piece[:] = layer_params[param_idx].get_value(
                borrow=True, return_internal_type=True
            )

    def funcs(out, params):
        """Compile the forward function and the gradient of the MSE cost."""
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out) ** 2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(
        rnnb.apply(params_cudnn, X, h0, c0)[0], [params_cudnn]
    )

    # Random draws happen in the order x, y, h0, c0.
    x_val = np.random.random((timesteps, batch_size, input_dim)).astype(floatX)
    y_val = np.random.random((timesteps, batch_size, hidden_dim)).astype(floatX)
    h0_val = np.random.random((depth, batch_size, hidden_dim)).astype(floatX)
    c0_val = np.random.random((depth, batch_size, hidden_dim)).astype(floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)
    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)

    # Gradients with respect to X, h0 and c0.
    for grad_idx in (0, 1, 2):
        utt.assert_allclose(ref_grads[grad_idx], cudnn_grads[grad_idx])

    # Parameter gradients: split the flat cuDNN gradient per layer and
    # compare it with the matching slice of the reference gradients.
    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])

    for layer_idx in range(depth):
        cudnn_grads_layer = rnnb.split_params(
            cudnn_grads_params, layer_idx, [batch_size, input_dim]
        )
        per_layer = len(cudnn_grads_layer)
        ref_grads_layer = ref_grads_params[
            layer_idx * per_layer:(layer_idx + 1) * per_layer
        ]
        for param_idx, cudnn_grad in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[param_idx], cudnn_grad)
def test_validate_input_types_gpuarray_backend():
    """Call ``rng_mrg.mrg_uniform.new`` on a GPU state with test values on.

    The test passes when building the node under
    ``compute_test_value="raise"`` does not raise.
    """
    with config.change_flags(compute_test_value="raise"):
        gpu_state = gpuarray_shared_constructor(np.zeros((7, 6), dtype="int32"))
        rng_mrg.mrg_uniform.new(gpu_state, ndim=None, dtype="float32", size=(3,))
X = T.tensor3('X') Y = T.tensor3('Y') h0 = T.tensor3('h0') c0 = T.tensor3('c0') rnnb = dnn.RNNBlock( theano.config.floatX, hidden_dim, depth, network_type, input_mode='skip' ) psize = rnnb.get_param_size([batch_size, hidden_dim]) params_cudnn = gpuarray_shared_constructor( np.zeros((psize,), dtype=theano.config.floatX) ) # lstm = LSTM(input_dim, hidden_dim) output = rnnb.apply(params_cudnn, X, h0, c0)[0] # Only hidden states cost = T.mean((Y - output) ** 2) grads = T.grad(cost, params_cudnn) cudnn_fn = theano.function( inputs=[], outputs=output, mode=mode_with_gpu, givens={X: x_val, h0: h0_val, c0: c0_val} ) cudnn_grad_fn = theano.function( inputs=[], outputs=grads,