Python float32_shared_constructor示例，theano.sandbox.cuda.var.float32_shared_constructor Python示例

示例#1

0

显示文件

文件： test_gpu_unshared_conv.py 项目： saikrishnar/TheanoLinear

 def setUp(self):
     self.gwa = GpuWeightActs(module_stride=self.module_stride,
                              partial_sum=self.partial_sum)
     self.gpu_images = float32_shared_constructor(
         numpy.random.rand(*self.ishape).astype(self.dtype))
     self.gpu_hidact = float32_shared_constructor(
         numpy.random.rand(*self.hshape).astype(self.dtype))

示例#2

0

显示文件

        def run_match(self,
                      images,
                      filters,
                      module_stride,
                      retvals=False,
                      partial_sum=1):

            gfa = GpuFilterActs(module_stride, partial_sum)
            fa = FilterActs(module_stride)

            gpu_images = float32_shared_constructor(images)
            gpu_filters = float32_shared_constructor(filters)
            cpu_images = theano.shared(images)
            cpu_filters = theano.shared(filters)

            gpu_out = gfa(gpu_images, gpu_filters)
            cpu_out = fa(cpu_images, cpu_filters)

            f = theano.function([], [cpu_out, gpu_out])
            cpuval, gpuval = f()
            gpuval = numpy.asarray(gpuval)

            if retvals:
                return cpuval, gpuval
            else:
                #print 'run_match: cpu shape', cpuval.shape
                #print 'run_match: gpu shape', gpuval.shape
                assert cpuval.shape == gpuval.shape
                assert numpy.allclose(cpuval, gpuval)

示例#3

0

显示文件

文件： test_weight_acts_strided.py 项目： AlexArgus/pylearn2

def test_weight_acts_strided():

    # Tests that WeightActs with all possible strides 

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]
    for partial_sum in [0, 1, 4]:
        print "partial_sum: %d"%(partial_sum)
        for test_idx in xrange(len(shape_list)):
            images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
            filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
            gpu_images = float32_shared_constructor(images,name='images')
            print "test case %d..."%(test_idx+1) 
              
            for ii in xrange(filters.shape[1]):
                stride = ii + 1                            
                output_python = FilterActs_python(images,filters,stride)   
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows*h_cols)%partial_sum != 0:
                        print "skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum)
                        break
                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
                    
                weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride)
                
                weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)(
                                                    gpu_images,
                                                    gpu_hidacts,
                                                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                                                   )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()   
                
                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")
                
                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print 'cuda-convnet shape: ',weights_grad_val.shape
                        print 'python conv shape: ',weights_grad_python.shape
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print 'stride %d'%stride
                    print 'absolute error range: ', (err.min(), err.max())
                    print 'mean absolute error: ', err.mean()
                    print 'cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max())
                    print 'python conv value range: ', (weights_grad_python.min(), weights_grad_python.max())

示例#4

0

显示文件

文件： test_weight_acts_strided.py 项目： thomkallor/pylearn2

def test_weight_acts_strided():

    # Tests that WeightActs with all possible strides 

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]
    for partial_sum in [0, 1, 4]:
        print("partial_sum: %d"%(partial_sum))
        for test_idx in xrange(len(shape_list)):
            images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
            filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
            gpu_images = float32_shared_constructor(images,name='images')
            print("test case %d..."%(test_idx+1))
              
            for ii in xrange(filters.shape[1]):
                stride = ii + 1                            
                output_python = FilterActs_python(images,filters,stride)   
                _, h_rows, h_cols, _ = output_python.shape
                if partial_sum == 4:
                    if (h_rows*h_cols)%partial_sum != 0:
                        print("skip test case %d, stride %d when partial_sum is equal to %d"%(test_idx+1,stride,partial_sum))
                        break
                hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
                gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
                    
                weights_grad_python = WeightActs_python(images,hidacts,filters.shape[1],filters.shape[2],stride)
                
                weights_grad = WeightActs(partial_sum=partial_sum,stride=stride)(
                                                    gpu_images,
                                                    gpu_hidacts,
                                                    as_tensor_variable((filters.shape[1], filters.shape[2]))
                                                   )[0]
                weights_grad = host_from_gpu(weights_grad)
                f = function([], weights_grad)
                weights_grad_val = f()   
                
                warnings.warn("""test_weight_acts_strided success criterion is not very strict.""")
                
                if np.abs(weights_grad_val - weights_grad_python).max() > 3.4e-5:
                    assert type(weights_grad_val) == type(weights_grad_python)
                    assert weights_grad_val.dtype == weights_grad_python.dtype
                    if weights_grad_val.shape != weights_grad_python.shape:
                        print('cuda-convnet shape: ',weights_grad_val.shape)
                        print('python conv shape: ',weights_grad_python.shape)
                        assert False
                    err = np.abs(weights_grad_val - weights_grad_python)
                    print('stride %d'%stride)
                    print('absolute error range: ', (err.min(), err.max()))
                    print('mean absolute error: ', err.mean())
                    print('cuda-convnet value range: ', (weights_grad_val.min(), weights_grad_val.max()))
                    print('python conv value range: ', (weights_grad_python.min(), weights_grad_python.max()))

示例#5

0

显示文件

文件： test_gpu_unshared_conv.py 项目： AlexArgus/pylearn2

 def setUp(self):
     self.gwa = GpuWeightActs(
             module_stride=self.module_stride,
             partial_sum=self.partial_sum)
     self.gpu_images = float32_shared_constructor(
             numpy.random.rand(*self.ishape).astype(self.dtype))
     self.gpu_hidact = float32_shared_constructor(
             numpy.random.rand(*self.hshape).astype(self.dtype))

示例#6

0

显示文件

文件： test_gpu_unshared_conv.py 项目： AlexArgus/pylearn2

 def setUp(self):
     test_unshared_conv.TestFilterActs.setUp(self)
     self.gpu_op = GpuFilterActs(
             module_stride=self.module_stride,
             partial_sum=1)
     self.s_images = float32_shared_constructor(
             self.s_images.get_value())
     self.s_filters = float32_shared_constructor(
             self.s_filters.get_value())

示例#7

0

显示文件

文件： test_gpu_unshared_conv.py 项目： shayan-taheri/TheanoLinear

 def setUp(self):
     test_unshared_conv.TestFilterActs.setUp(self)
     self.gpu_op = GpuFilterActs(
             module_stride=self.module_stride,
             partial_sum=1)
     self.s_images = float32_shared_constructor(
             self.s_images.get_value())
     self.s_filters = float32_shared_constructor(
             self.s_filters.get_value())

示例#8

0

显示文件

文件： test_image_acts_strided.py 项目： 123fengye741/pylearn2

def test_image_acts_strided():

    # Tests that running FilterActs with all possible strides 

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print("test case %d..."%(test_idx+1))
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
                   
            output_python = FilterActs_python(images,filters,stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
            Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2]))            
            
            Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()
            
            warnings.warn("""test_image_acts_strided success criterion is not very strict.""")
            
            if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                if Img_output_val.shape != Img_output_python.shape:
                    print('cuda-convnet shape: ',Img_output_val.shape)
                    print('python conv shape: ',Img_output_python.shape)
                    assert False
                err = np.abs(Img_output_val - Img_output_python)
                print('stride %d'%stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max()))
                print('python conv value range: ', (Img_output_python.min(), Img_output_python.max()))

示例#9

0

显示文件

文件： test_filter_acts_strided.py 项目： yo-ga/TextDetector

def test_filter_acts_strided():

    # Tests that FilterActs with all possible strides

    rng = np.random.RandomState([2012, 10, 9])

    #Each list in shape_list :
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [
        [(1, 7, 8, 5), (1, 2, 2, 16)],
        [(3, 7, 8, 5), (3, 3, 3, 16)],
        [(16, 11, 11, 4), (16, 4, 4, 16)],
        [(3, 20, 20, 3), (3, 5, 5, 16)],
        [(3, 21, 21, 3), (3, 6, 6, 16)],
    ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1.,
                             shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1.,
                              shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images, name='images')
        gpu_filters = float32_shared_constructor(filters, name='filters')
        print("test case %d..." % (test_idx + 1))

        for ii in xrange(filters.shape[1]):
            stride = ii + 1

            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()

            output_python = FilterActs_python(images, filters, stride)

            if np.abs(output_val - output_python).max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                if output_val.shape != output_python.shape:
                    print('cuda-convnet shape: ', output_val.shape)
                    print('python conv shape: ', output_python.shape)
                    assert False
                err = np.abs(output_val - output_python)
                print('stride %d' % stride)
                print('absolute error range: ', (err.min(), err.max()))
                print('mean absolute error: ', err.mean())
                print('cuda-convnet value range: ',
                      (output_val.min(), output_val.max()))
                print('python conv value range: ',
                      (output_python.min(), output_python.max()))

示例#10

0

显示文件

def test_image_acts_strided():

    # Tests that running FilterActs with all possible strides 

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print "test case %d..."%(test_idx+1) 
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
                   
            output_python = FilterActs_python(images,filters,stride)
            hidacts = rng.uniform(-1., 1., output_python.shape).astype('float32')
            gpu_hidacts = float32_shared_constructor(hidacts,name='hidacts')
            Img_output_python = ImageActs_python(filters,hidacts,stride,(images.shape[1], images.shape[2]))            
            
            Img_output = ImageActs(stride=stride)(gpu_hidacts, gpu_filters, as_tensor_variable((images.shape[1], images.shape[2])))
            Img_output = host_from_gpu(Img_output)
            f = function([], Img_output)
            Img_output_val = f()
            
            warnings.warn("""test_image_acts_strided success criterion is not very strict.""")
            
            if np.abs(Img_output_val - Img_output_python).max() > 2.1e-5:
                assert type(Img_output_val) == type(Img_output_python)
                assert Img_output_val.dtype == Img_output_python.dtype
                if Img_output_val.shape != Img_output_python.shape:
                    print 'cuda-convnet shape: ',Img_output_val.shape
                    print 'python conv shape: ',Img_output_python.shape
                    assert False
                err = np.abs(Img_output_val - Img_output_python)
                print 'stride %d'%stride
                print 'absolute error range: ', (err.min(), err.max())
                print 'mean absolute error: ', err.mean()
                print 'cuda-convnet value range: ', (Img_output_val.min(), Img_output_val.max())
                print 'python conv value range: ', (Img_output_python.min(), Img_output_python.max())

示例#11

0

显示文件

文件： test_blocksparse.py 项目： wycg1984/Theano

def test_blocksparse_grad_merge():
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    W = float32_shared_constructor(W_val)

    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = theano.grad(o.sum(), W)

    lr = numpy.asarray(0.05, dtype='float32')

    upd = W - lr * gW

    f1 = theano.function([h, iIdx, b, oIdx],
                         updates=[(W, upd)],
                         mode=mode_with_gpu)
    # not running with mode=gpu ensures that the elemwise is not merged in
    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)])

    f2(h_val, iIdx_val, b_val, oIdx_val)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    f1(h_val, iIdx_val, b_val, oIdx_val)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)

示例#12

0

显示文件

文件： test_blocksparse.py 项目： 317070/Theano

def test_blocksparse_grad_merge():
    b = tensor.fmatrix()
    h = tensor.ftensor3()
    iIdx = tensor.lmatrix()
    oIdx = tensor.lmatrix()

    W_val, h_val, iIdx_val, b_val, oIdx_val = blocksparse_data()
    W = float32_shared_constructor(W_val)

    o = sparse_block_gemv_ss(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
    gW = theano.grad(o.sum(), W)

    lr = numpy.asarray(0.05, dtype='float32')

    upd = W - lr * gW

    f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
                         mode=mode_with_gpu)
    # not running with mode=gpu ensures that the elemwise is not merged in
    mode = None
    if theano.config.mode == 'FAST_COMPILE':
        mode = theano.compile.mode.get_mode('FAST_RUN')

    f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

    f2(h_val, iIdx_val, b_val, oIdx_val)
    W_ref = W.get_value()

    # reset the var
    W.set_value(W_val)
    f1(h_val, iIdx_val, b_val, oIdx_val)
    W_opt = W.get_value()

    utt.assert_allclose(W_ref, W_opt)

示例#13

0

显示文件

文件： test_filter_acts_strided.py 项目： AlexArgus/pylearn2

def test_filter_acts_strided():

    # Tests that FilterActs with all possible strides 

    rng = np.random.RandomState([2012,10,9])

    #Each list in shape_list : 
    #[img_shape,filter_shape]
    #[(channels, rows, cols, batch_size),(channels, filter_rows, filter_cols, num_filters)]
    shape_list = [[(1, 7, 8, 5),     (1, 2, 2, 16)],
                  [(3, 7, 8, 5),     (3, 3, 3, 16)],
                  [(16, 11, 11, 4),  (16, 4, 4, 16)], 
                  [(3, 20, 20, 3),   (3, 5, 5, 16)],
                  [(3, 21, 21, 3),   (3, 6, 6, 16)],
                  ]

    for test_idx in xrange(len(shape_list)):
        images = rng.uniform(-1., 1., shape_list[test_idx][0]).astype('float32')
        filters = rng.uniform(-1., 1., shape_list[test_idx][1]).astype('float32')
        gpu_images = float32_shared_constructor(images,name='images')
        gpu_filters = float32_shared_constructor(filters,name='filters')
        print "test case %d..."%(test_idx+1) 
        
        for ii in xrange(filters.shape[1]):
            stride = ii + 1
            
            output = FilterActs(stride=stride)(gpu_images, gpu_filters)
            output = host_from_gpu(output)
            f = function([], output)
            output_val = f()
        
            output_python = FilterActs_python(images,filters,stride)
                        
            if np.abs(output_val - output_python).max() > 8.6e-6:
                assert type(output_val) == type(output_python)
                assert output_val.dtype == output_python.dtype
                if output_val.shape != output_python.shape:
                    print 'cuda-convnet shape: ',output_val.shape
                    print 'python conv shape: ',output_python.shape
                    assert False
                err = np.abs(output_val - output_python)
                print 'stride %d'%stride
                print 'absolute error range: ', (err.min(), err.max())
                print 'mean absolute error: ', err.mean()
                print 'cuda-convnet value range: ', (output_val.min(), output_val.max())
                print 'python conv value range: ', (output_python.min(), output_python.max())

示例#14

0

显示文件

文件： test_gpu_unshared_conv.py 项目： sdmassey27/pylearn2

        def run_match(self, images, filters, module_stride, retvals=False):

            gfa = GpuFilterActs(module_stride)
            fa = FilterActs(module_stride)

            gpu_images = float32_shared_constructor(images)
            gpu_filters = float32_shared_constructor(filters)
            cpu_images = theano.shared(images)
            cpu_filters = theano.shared(filters)

            gpu_out = gfa(gpu_images, gpu_filters)
            cpu_out = fa(cpu_images, cpu_filters)

            f = theano.function([], [cpu_out, gpu_out])
            cpuval, gpuval = f()
            gpuval = numpy.asarray(gpuval)

            if retvals:
                return cpuval, gpuval
            else:
                # print 'run_match: cpu shape', cpuval.shape
                # print 'run_match: gpu shape', gpuval.shape
                assert cpuval.shape == gpuval.shape
                assert numpy.allclose(cpuval, gpuval)

示例#15

0

显示文件

    def Xtest_blocksparse_grad_merge(self):
        b = tensor.fmatrix()
        h = tensor.ftensor3()
        iIdx = tensor.lmatrix()
        oIdx = tensor.lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = float32_shared_constructor(W_val)

        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = theano.grad(o.sum(), W)

        lr = numpy.asarray(0.05, dtype='float32')

        upd = W - lr * gW

        f1 = theano.function([h, iIdx, b, oIdx],
                             updates=[(W, upd)],
                             mode=mode_with_gpu)

        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
                          GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
        mode = mode.excluding('local_merge_blocksparse_output')

        f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
                              GpuSparseBlockOuter)

        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)

示例#16

0

显示文件

文件： test_blocksparse.py 项目： Abioy/Theano

    def Xtest_blocksparse_grad_merge(self):
        b = tensor.fmatrix()
        h = tensor.ftensor3()
        iIdx = tensor.lmatrix()
        oIdx = tensor.lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = float32_shared_constructor(W_val)

        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = theano.grad(o.sum(), W)

        lr = numpy.asarray(0.05, dtype='float32')

        upd = W - lr * gW

        f1 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)],
                             mode=mode_with_gpu)

        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
                          GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding('local_merge_blocksparse_alpha')
        mode = mode.excluding('local_merge_blocksparse_output')

        f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
                              GpuSparseBlockOuter)

        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)