Example 1
def test_reshape():

    a = tcn.CudaNdarrayType((False,))()
    b = tcn.CudaNdarrayType((False,False))()
    c = T.reshape(a, [2,3])

    #basic
    f = theano.function([a], c, mode=mode_without_gpu)
    fv = f(cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32')))
    assert numpy.all(fv == numpy.asarray([[0,1,2], [3,4,5]]))

    #test that it works without inplace operations
    a_val = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32'))
    a_val_copy = cuda_ndarray.CudaNdarray(theano._asarray([0,1,2,3,4,5],dtype='float32'))
    b_val = cuda_ndarray.CudaNdarray(theano._asarray([[0,1,2],[3,4,5]],dtype='float32'))

    f_sub = theano.function([a,b], c-b, mode=mode_without_gpu)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    #test that it works with inplace operations
    a_val = theano._asarray([0,1,2,3,4,5], dtype='float32')
    a_val_copy = theano._asarray([0,1,2,3,4,5], dtype='float32')
    b_val = theano._asarray([[0,1,2],[3,4,5]], dtype='float32')

    f_sub = theano.function([a,b], c-b, mode=mode_without_gpu)
    assert numpy.all(f_sub(a_val, b_val) == 0.0)
    assert numpy.all(numpy.asarray(a_val) == numpy.asarray(a_val_copy))

    # verify gradient
    def just_vals(v):
        return T.Reshape(2)(v, theano._asarray([2,3], dtype='int32'))
    utt.verify_grad(just_vals, [a_val])
Example 2
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(broadcastable=[sh == 1
                                            for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    # try to compile reference implementation without shape,
    # so we don't have to compile hundreds of versions
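    # Note: conv2d computes a true convolution, so the kernel is flipped here
    # to match the correlation computed by the op under test.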
    conv_op = tensor.nnet.conv2d(i,
                                 k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    try:
        conv_op_di = theano.grad(conv_op.sum(), i)
        conv_op_dk = theano.grad(conv_op.sum(), k)
    except Exception:
        # compile with shape information only when needed
        conv_op = tensor.nnet.conv2d(i, k[:, :, ::-1, ::-1], ishape, kshape,
                                     mode, subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [
        corr_op, conv_op, corr_op_di, conv_op_di, corr_op_dk, conv_op_dk
    ]
    try:
        conv_op_dik = theano.grad(conv_op_di.sum(), k)
        conv_op_dki = theano.grad(conv_op_dk.sum(), i)
        corr_op_dik = theano.grad(corr_op_di.sum(), k)
        corr_op_dki = theano.grad(corr_op_dk.sum(), i)
        outputs.extend([corr_op_dik, conv_op_dik, corr_op_dki, conv_op_dki])
    except Exception:
        # skip if the reference implementation can't do it
        pass

    f = theano.function([i, k],
                        outputs,
                        mode=theano_mode.excluding('conv_dnn', 'conv_gemm'))

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(
            allvals[::2], allvals[1::2], outputs[::2], outputs[1::2],
        ('top', 'dtop/dbottom', 'dtop/dweight', 'dtop/dbottom/dweight',
         'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]

        assert_allclose(a, b, rtol=1e-4)
Example 3
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(broadcastable=[sh == 1
                                            for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    conv_op = tensor.nnet.conv2d(i,
                                 k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [
        corr_op, conv_op, corr_op_di, conv_op_di, corr_op_dk, conv_op_dk
    ]

    conv_op_dik = theano.grad(conv_op_di.sum(), k)
    conv_op_dki = theano.grad(conv_op_dk.sum(), i)
    corr_op_dik = theano.grad(corr_op_di.sum(), k)
    corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    outputs.extend([corr_op_dik, conv_op_dik, corr_op_dki, conv_op_dki])

    if not theano.config.blas.ldflags:
        # Some of the operations are not transferred to the GPU,
        # and without BLAS, the abstract Op will not be optimized
        # to CorrMM either, so we have to accept the use of the
        # slow Python convolution in that case.
        mode = theano_mode.excluding('AbstractConvCheck')
    else:
        mode = theano_mode

    f = theano.function([i, k], outputs, mode=mode)

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(
            allvals[::2], allvals[1::2], outputs[::2], outputs[1::2],
        ('top', 'dtop/dbottom', 'dtop/dweight', 'dtop/dbottom/dweight',
         'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]

        assert_allclose(a, b, rtol=1e-4)
Example 4
def test_transfer_cuda_gpu():
    import theano.sandbox.cuda as cuda_ndarray
    if cuda_ndarray.cuda_available is False:
        raise SkipTest("Can't test interaction with cuda if cuda not present")
    g = GpuArrayType(dtype='float32', broadcastable=(False, False))('g')
    c = cuda_ndarray.CudaNdarrayType((False, False))('c')

    av = theano._asarray(rng.rand(5, 4), dtype='float32')
    gv = gpuarray.array(av)
    cv = cuda_ndarray.CudaNdarray(av)
    gvs = gv[:, ::-2]
    cvs = cv[:, ::-2]

    f = theano.function([c], gpu_from_cuda(c))
    fv = f(cv)
    assert GpuArrayType.values_eq_approx(fv, gv)

    fvs = f(cvs)
    assert GpuArrayType.values_eq_approx(fvs, gvs)

    f = theano.function([g], cuda_from_gpu(g))
    fv = f(gv)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fv, cv)

    fvs = f(gvs)
    assert cuda_ndarray.CudaNdarrayType.values_eq_approx(fvs, cvs)
Example 5
def conv_grad(mode, bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsample, op):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(broadcastable=[sh == 1
                                            for sh in npy_kern.shape])()

    # TODO: also test custom pad values
    corr_op = op(mode, subsample)(i, k)
    conv_op = tensor.nnet.conv2d(i,
                                 k[:, :, ::-1, ::-1],
                                 border_mode=mode,
                                 subsample=subsample)
    conv_op_di = theano.grad(conv_op.sum(), i)
    conv_op_dk = theano.grad(conv_op.sum(), k)
    corr_op_di = theano.grad(corr_op.sum(), i)
    corr_op_dk = theano.grad(corr_op.sum(), k)
    outputs = [
        corr_op, conv_op, corr_op_di, conv_op_di, corr_op_dk, conv_op_dk
    ]

    conv_op_dik = theano.grad(conv_op_di.sum(), k)
    conv_op_dki = theano.grad(conv_op_dk.sum(), i)
    corr_op_dik = theano.grad(corr_op_di.sum(), k)
    corr_op_dki = theano.grad(corr_op_dk.sum(), i)
    outputs.extend([corr_op_dik, conv_op_dik, corr_op_dki, conv_op_dki])

    # TODO: fix when the abstractconv tests can pass debug mode.
    mode = theano_mode
    if theano.config.mode == 'DEBUG_MODE':
        mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
    f = theano.function([i, k], outputs, mode=mode)

    allvals = f(npy_img, npy_kern)

    for a, b, oa, ob, p in zip(
            allvals[::2], allvals[1::2], outputs[::2], outputs[1::2],
        ('top', 'dtop/dbottom', 'dtop/dweight', 'dtop/dbottom/dweight',
         'dtop/dweight/dbottom')):
        assert oa.type.broadcastable[:2] == ob.type.broadcastable[:2]

        assert_allclose(a, b, rtol=1e-4)
Example 6
def cpu_var_to_gpu_var(x):
    from theano.sandbox import cuda
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    name = 'gpu_%s' % x.name
    name = None
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
Example 7
def SharedThunkLSTMFunc(rst, inpshape, oupshape, noot, backwards, MOMENTUM,
                        LEARN_RATE):
    print "MAKE INNER FUNCTION", inpshape, oupshape, noot, backwards
    cuda2d = cuda.CudaNdarrayType((False, False))  #T.fmatrix
    isym = SymbolLayer(cuda2d(), (100, inpshape))
    oval, l1f = BlockLSTMUnrollArrayToArray(rst,
                                            isym,
                                            oupshape,
                                            noot=noot,
                                            backwards=backwards)
    oflag = cuda2d()

    oupfunc = theano.function([isym.output],
                              cuda.basic_ops.as_cuda_ndarray_variable(
                                  oval.output))
    infshape = lambda x0: (x0[0][0], oupshape)
    #GRADFUNC
    g = T.sum(oval.output * oflag)
    iglist = T.grad(g, [isym.output] + l1f.params)
    olist = iglist[0]
    glist = iglist[1:]
    #Generate MOMENTUM
    mom = []
    for i in l1f.params:
        init = np.zeros_like(i.get_value())
        mom.append(theano.shared(init, name=i.name + '_momentum_ct'))
    #Additive update
    updates = []
    for i, j in zip(glist, mom):
        updates.append((j, j - i * LEARN_RATE))
    momup = []
    for i in mom:
        momup.append((i, i * MOMENTUM))
    print "MAIN UPDATES", updates
    resetmom = theano.function([], [], updates=momup)
    getgrad = theano.function([isym.output, oflag],
                              cuda.basic_ops.as_cuda_ndarray_variable(olist),
                              updates=updates)
    sharedop = GPUSharedThunkOp(oupfunc, infshape, getgrad)

    #Make sharedlayer
    class SharedLayer(Layer, Param, CMomentum):
        def get_momentums(self):
            return []

        def __init__(self, inp, paramroot=False):
            if paramroot:
                self.params = l1f.params
                self.get_momentums = lambda: mom
            else:
                self.params = []
            self.output = sharedop(inp.output)
            self.output_shape = oval.output_shape

    return SharedLayer, resetmom
Example 8
def gemm_directly(bs, ch, nf, rImg1, rImg2, rFlt1, rFlt2, subsx, subsy,
                  direction):
    ishape = (bs, ch, rImg1, rImg2)
    kshape = (nf, ch, rFlt1, rFlt2)
    subsample = (subsx, subsy)

    npy_img = theano._asarray(numpy.random.rand(*ishape), dtype='float32')
    npy_kern = theano._asarray(numpy.random.rand(*kshape), dtype='float32')

    i = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(
        broadcastable=[sh == 1 for sh in npy_kern.shape])()

    if direction == 'fprop':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM(border_mode='valid',
                                                subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
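        # GpuCorrMM computes a correlation, so the kernel is flipped to match
        # the convolution computed by py_conv.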
        gpuval = f(npy_img, npy_kern[:,:,::-1,::-1])
    elif direction == 'bprop img':
        cpuval = py_conv(npy_img, npy_kern, 'full', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradInputs(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = f(npy_kern.transpose(1, 0, 2, 3), npy_img)
    elif direction == 'bprop kern':
        cpuval = py_conv(npy_img, npy_kern, 'valid', subsample)
        op = theano.sandbox.cuda.blas.GpuCorrMM_gradWeights(
            border_mode='valid', subsample=subsample)(i, k)
        f = theano.function([i, k], op, mode=theano_mode)
        gpuval = numpy.array(f(
                npy_img.transpose(1, 0, 2, 3),
                npy_kern.transpose(1, 0, 2, 3)[:,:,::-1,::-1])).transpose(
            1, 0, 2, 3)

    assert_allclose(cpuval, gpuval, rtol=1e-4)
Example 9
        def make_node(self, img):
            assert hasattr(self, '_props'), "Your version of theano is too old " \
                "to support __props__."
            # Theano's CudaNdarray supports strides, but using them would
            # require writing C code that calls the functions in
            # sandbox/cuda/cuda_ndarray.cuh and passes all the strides to the
            # kernel to do the correct computation. Instead, enforce
            # contiguous arrays.
            cu_img = cuda.basic_ops.gpu_contiguous(
                cuda.basic_ops.as_cuda_ndarray_variable(img))
            assert cu_img.dtype == 'float32'

            # N x nchannels x nbins
            output = cuda.CudaNdarrayType(
                dtype='float32',
                broadcastable=[False, False, False])()
            return theano.Apply(self, [cu_img], [output])
Example 10
    def make_node(self, img, kern):
        img = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(img))
        kern = cuda.basic_ops.gpu_contiguous(
            cuda.basic_ops.as_cuda_ndarray_variable(kern))

        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if kern.type.ndim != 5:
            raise TypeError('kern must be 5D tensor')

        broadcastable = [
            kern.type.broadcastable[-1], False, False, False,
            img.type.broadcastable[-1]
        ]
        return theano.Apply(self, [img, kern],
                            [cuda.CudaNdarrayType(broadcastable)()])
Example 11
    def make_node(self, img, topgrad, shape):
        img = cuda.basic_ops.as_cuda_ndarray_variable(img)
        topgrad = cuda.basic_ops.as_cuda_ndarray_variable(topgrad)

        if img.type.ndim != 5:
            raise TypeError('img must be 5D tensor')
        if topgrad.type.ndim != 5:
            raise TypeError('topgrad must be 5D tensor')

        depth_height_width = [shape[0], shape[1], shape[2]]
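        # These three sizes from `shape` are appended below as extra symbolic
        # inputs of the Apply node, so they are available at run time.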

        broadcastable = [
            img.type.broadcastable[0], False, False, False,
            topgrad.type.broadcastable[0]
        ]
        return theano.Apply(self, [img, topgrad] + depth_height_width,
                            [cuda.CudaNdarrayType(broadcastable)()])
Example 12
def test_elemwise_collapse6():
    """ Test when all inputs have two broadcastable dimension at the
    beginning"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 'x', 0, 1)
    b = tcn.CudaNdarrayType((True, True, False, False))()
    f = pfunc([b], [a3 + b], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(1, 1, shape[0], shape[1]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    #let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, 1, shape[0], shape[1]) + v)
Example 13
def test_elemwise_collapse2():
    """ Test when only one inputs have one broadcastable dimension """

    shape = (4, 5, 9)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0, 'x', 1, 2)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(shape[0], 5, *shape[1:]),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    #let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(shape[0], 1, *shape[1:]) + v)
Example 14
def test_elemwise_collapse4():
    """ Test when only one inputs have two broadcastable dimension at
    each ends and we add a scalar"""

    shape = (4, 5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x', 0, 1, 'x')
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3 + b + 2)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(5, shape[0], shape[1], 4),
                        dtype='float32')
    v = cuda_ndarray.CudaNdarray(v)
    #let debugmode catch errors
    out = f(v)[0]
    assert numpy.allclose(out, a.reshape(1, shape[0], shape[1], 1) + v + 2)
Example 15
def speed_elemwise_collapse():
    """ used to time if the collapse of ccontiguous dims are useful """

    shape = (30, 40, 50, 600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),
                                                 dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:, ::2, :, :]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3 + b * tensor.exp(1 + b ** a3)
    f = pfunc([b], [c], mode=mode_with_gpu)

    v = theano._asarray(numpy.random.rand(*shape), dtype='float32')
    v = v[:, ::2, :, :]
    v = cuda_ndarray.CudaNdarray(v)
    t1 = time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
    t2 = time.time()
Example 16
def test_elemwise_collapse():
    """ Test when all inputs have one(and the same) broadcastable dimension """

    shape = (4,5,60)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle(0,'x',1,2)
    b = tcn.CudaNdarrayType((False, True, False, False))()
    c = a3+b
    f = pfunc([b], [c], mode=mode_with_gpu)


    v = theano._asarray(numpy.random.rand(shape[0],1,*shape[1:]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
    if False:
        for id,n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
    out=f(v)[0]
    assert numpy.allclose(out,a.reshape(shape[0],1,*shape[1:])+v)
    print "Expected collapse of all dimensions"
Example 17
def test_elemwise_collapse5():
    """ Test when only one inputs have two broadcastable dimension at the beginning and we add a scalar"""

    shape = (4,5)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2.dimshuffle('x','x',0,1)
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = (a3+b+2)
    f = pfunc([b], [c], mode=mode_with_gpu)


    v = theano._asarray(numpy.random.rand(5,4,shape[0],shape[1]),dtype='float32')
    v=cuda_ndarray.CudaNdarray(v)
    if False:
        for id,n in enumerate(f.maker.env.toposort()):
            print id, n
    #let debugmode catch errors
    out=f(v)[0]
    assert numpy.allclose(out,a.reshape(1,1,shape[0],shape[1])+v+2)
    print "Expected collapse to 2 dimensions"
Example 18
def speed_elemwise_collapse2():
    """ used to test the speed up of the generalised collapse of ccontiguous dims"""

    shape = (30,40,50,600)
    a = cuda_ndarray.CudaNdarray(theano._asarray(numpy.random.rand(*shape),dtype='float32'))
    a = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    a2 = tcn.shared_constructor(a, 'a')
    a3 = a2[:,:,:,::2]
    b = tcn.CudaNdarrayType((False, False, False, False))()
    c = a3+b * tensor.exp(1 + b**a3)
    f = pfunc([b], [c], mode=mode_with_gpu)


    v = theano._asarray(numpy.random.rand(*shape),dtype='float32')
    v = v[:,:,:,::2]
    v=cuda_ndarray.CudaNdarray(v)
    for id,n in enumerate(f.maker.env.toposort()):
        print id, n
    t1=time.time()
    for i in range(100):
        #let debugmode catch errors
        f(v)
    t2=time.time()
Example 19
            #test with broadcast
        for shape, pattern in [((5,),[0]),
                               ((5,4),[0,1]),((5,4),[0]),
                               ((5,4,3),[0]),((5,4,3),[0,1]),
                               ((5,4,3),[2]),((5,4,3),[0,1,2]),
                               ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
            op = careduce_op(scalar_op, axis=pattern)
            pat = tensor_pattern_to_gpu_pattern(shape, pattern)
            # GpuCAReduce{maximum} supports only these patterns
            if scalar_op is theano.scalar.maximum and pat not in [
                (0, 1), (0, 1, 1), (0, 1, 1)]:
                continue
            shape = numpy.asarray(shape) * 2
            a = tensor.TensorType('float32', (False,) * len(shape))()
            a2 = tcn.CudaNdarrayType((False,) * len(shape))()
            b = op(a)
            b2 = op(a2)
            val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
    #        val = numpy.ones(shape)
    #        val = numpy.arange(numpy.prod(shape)).reshape(shape)
            val = theano._asarray(val, dtype='float32')
            val2 = cuda.CudaNdarray(val)
            if len(shape) == 1:
                val = val[::2]
                val2 = val2[::2]
            elif len(shape) == 2:
                val = val[::2, ::2]
                val2 = val2[::2, ::2]
            elif len(shape) == 3:
                val = val[::2, ::2, ::2]
Example 20
def _params_allgood(ishape,
                    kshape,
                    mode,
                    subsample=(1, 1),
                    img_stride=(1, 1),
                    kern_stride=(1, 1),
                    version=-1,
                    verbose=0,
                    random=True,
                    print_=None,
                    id=None,
                    rtol=1e-5,
                    atol=1e-8,
                    nb_iter=0,
                    ones=False,
                    compile_kshp=None,
                    theano_mode=None,
                    cls=None):
    #
    # This function is the core of several of the big unit-test drivers,
    # but it can also be used very directly on its own to test a specific
    # kind of convolution.
    #
    # See `test_example` (above) for an example of how to use this directly.
    #
    # :param kshape: (4d) The shape of the kernel at run time.
    # :param compile_kshp: (2d) Hard-code the shape of the kernel in
    #                      the generated code. This is supposed to be
    #                      faster, but we need to check that we raise
    #                      an error if the input has the wrong shape.
    #
    if ones:
        assert not random
        npy_img = theano._asarray(numpy.ones(ishape), dtype='float32')
        npy_kern = -theano._asarray(numpy.ones(kshape), dtype='float32')
    elif random:
        npy_img = theano._asarray(numpy.random.rand(*ishape) + 1,
                                  dtype='float32')
        npy_kern = theano._asarray(numpy.random.rand(*kshape) - 2,
                                   dtype='float32')
    else:
        npy_img = theano._asarray(numpy.arange(
            numpy.prod(ishape)).reshape(ishape),
                                  dtype='float32') + 1
        npy_kern = -(
            theano._asarray(numpy.arange(numpy.prod(kshape)).reshape(kshape),
                            dtype='float32') + 1)

    img = cuda_ndarray.CudaNdarray(npy_img)
    kern = cuda_ndarray.CudaNdarray(npy_kern)

    # We apply the strides after the transfer, as the data is made
    # c-contiguous on the GPU.
    if img_stride != (1, 1):
        img = img[:, :, ::img_stride[0], ::img_stride[1]]
        npy_img = npy_img[:, :, ::img_stride[0], ::img_stride[1]]
    if kern_stride != (1, 1):
        kern = kern[:, :, ::kern_stride[0], ::kern_stride[1]]
        npy_kern = npy_kern[:, :, ::kern_stride[0], ::kern_stride[1]]

    i = cuda.CudaNdarrayType(broadcastable=[sh == 1 for sh in npy_img.shape])()
    k = cuda.CudaNdarrayType(broadcastable=[sh == 1
                                            for sh in npy_kern.shape])()
    op = theano.sandbox.cuda.blas.GpuConv(border_mode=mode,
                                          subsample=subsample,
                                          version=version,
                                          verbose=verbose,
                                          kshp=compile_kshp)(i, k)
    f = theano.function([i, k], op, mode=theano_mode)
    if cls is not None:
        assert any([
            isinstance(node.op, cls) for node in f.maker.fgraph.toposort()
        ]), "Cannot find class %r in %r" % (cls, f.maker.fgraph.toposort())
    t2 = time.time()
    gpuval = f(img, kern)
    t3 = time.time()
    for i in range(nb_iter):
        gpuval2 = f(img, kern)
        assert (numpy.asarray(gpuval) == numpy.asarray(gpuval2)).all()
    gpuval = numpy.asarray(gpuval)

    # CPU val computed after GPU val to get the GPU errors.
    t0 = time.time()
    cpuval = py_conv(npy_img, npy_kern, mode, subsample)
    t1 = time.time()

    assert gpuval.shape == cpuval.shape, ("shape mismatch", gpuval.shape,
                                          cpuval.shape)
    assert_allclose(cpuval, gpuval, rtol=rtol, atol=atol)
    assert numpy.all(numpy.isfinite(gpuval)), gpuval
    assert all((sh == 1) is br
               for sh, br in zip(cpuval.shape[:2], op.type.broadcastable[:2]))

    if (t2 is not None):
        if mode == 'valid':
            approx_fp = cpuval.size * ishape[1] * kshape[2] * kshape[3] * 2
        else:
            approx_fp = (ishape[0] * kshape[0] * kshape[1] * kshape[2] *
                         kshape[3] * ishape[2] * ishape[3] * 2)
        approx_fp /= 1e6
        cpu_mflops = approx_fp / (t1 - t0)
        gpu_mflops = approx_fp / (t3 - t2)
        if verbose > 0:
            print('%15s' % str(ishape),
                  '%15s' % str(kshape),
                  end=' ',
                  file=sys.stdout)
            print('%12.5f  %7.2f %7.2f %7.1f' %
                  (approx_fp, cpu_mflops, gpu_mflops, (t1 - t0) / (t3 - t2)),
                  file=sys.stdout)
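
# A minimal, hypothetical sketch of calling _params_allgood directly, as its
# docstring suggests; the shapes below are illustrative only and assume the
# module-level `theano_mode` and a working CUDA setup.
def _example_params_allgood_direct_call():
    _params_allgood(ishape=(2, 3, 8, 8),
                    kshape=(4, 3, 5, 5),
                    mode='valid',
                    subsample=(1, 1),
                    verbose=0,
                    theano_mode=theano_mode)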
Example 21
 def output_type(self, inp):
     return cuda.CudaNdarrayType(broadcastable=[False] * (inp.type.ndim))
Example 22
 def output_type(self, inp):
     return cuda.CudaNdarrayType(broadcastable=[False, False])
Example 23
def cpu_to_gpu_var(x):
    type = cuda.CudaNdarrayType(broadcastable=x.broadcastable)
    name = gpu_name(x.name)
    gpu_var = cuda.CudaNdarrayVariable(type=type, name=name)
    cpu_var = cuda.host_from_gpu(gpu_var)
    return gpu_var, cpu_var
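
# A minimal usage sketch (hypothetical, assuming a CUDA-enabled Theano and the
# gpu_name helper used above): the returned pair lets a computation be built
# against the host view of a GPU input, then moved back to the device.
import theano
import theano.tensor as T
from theano.sandbox import cuda

x = T.fmatrix('x')
gpu_x, cpu_x = cpu_to_gpu_var(x)
# Build the graph on the host-side copy of the GPU input, then transfer the
# result back to the GPU so the compiled function takes and returns GPU data.
y = T.tanh(cpu_x)
f = theano.function([gpu_x], cuda.basic_ops.as_cuda_ndarray_variable(y))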
Example 24
import numpy

import theano
import theano.sandbox.cuda as cuda
from theano import tensor
from theano.gof.python25 import any
from theano.tests.unittest_tools import seed_rng

from nose.plugins.skip import SkipTest

# Skip test if cuda_ndarray is not available.
import theano.sandbox.cuda as cuda_ndarray
if cuda_ndarray.cuda_available == False:
    raise SkipTest('Optional package cuda disabled')

# Needed as the GPU conv ops don't have a perform implementation.
if theano.config.mode == 'FAST_COMPILE':
    theano_mode = theano.compile.mode.get_mode('FAST_RUN').including('gpu')
else:
    theano_mode = theano.compile.mode.get_default_mode().including('gpu')

cuda_tensor4 = cuda_ndarray.CudaNdarrayType([False] * 4)

device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda_ndarray.shared_constructor(numpy.zeros(2, dtype='float32'))
device_id = theano.sandbox.cuda.use.device_number
if device_id is None:
    cuda.use("gpu",
             force=False,
             default_to_move_computation_to_gpu=False,
             move_shared_float32_to_gpu=False,
             enable_cuda=False,
             test_driver=True)
    device_id = theano.sandbox.cuda.use.device_number
cuda_ndarray = theano.sandbox.cuda.cuda_ndarray.cuda_ndarray
device_prop = cuda_ndarray.device_properties(device_id)
Example 25
 def output_type(self, inp):
     return cuda.CudaNdarrayType(
         broadcastable=[False] *
         (inp.type.ndim + 1))  # add one extra dim for real/imag
Example 26
 def output_type(self, inp):
     return cuda.CudaNdarrayType(
         broadcastable=[False] *
         (inp.type.ndim - 1))  # remove extra real/imag dim
Example 27
def test_sum():
    """
    test sum pattern 1, 11, 10, 01, 100, 110, 011, 001, 111, 0011, 0101, 0111, 1011, 1111

    test sum pattern implemented with reshape:
    1000, 0100, 0010, 0001, 11111

    others implemented by reshape that are not tested
    0011,0101,0110,1001,1010,1100
    1110,1101,1011

    TODO: test with broadcast
    """
    for shape, pattern in [((100,3,1300),[1]),
                           ((0,),[0]),((5,),[0]),
                           ((0,0),[0,1]),((1,0),[0,1]),((5,4),[0,1]),((33,31),[0,1]),((5,4),[1]),((5,4),[0]),#need something bigger than 32 for some opt test.
                           ((5,4,3),[0]),((5,4,3),[1]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[1,2]),((5,4,3),[0,1,2]),
                           ((0,0,0,0),[0,1,2,3]),
                           ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),
                           ((5,4,3,10,11),[1,2]),
                           ((5,4,3,20),[2,3]), ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3]),((5,4,3,2),[1,2,3]),

                           # test shapes bigger than 4096 in each dimension to make sure that we work correctly when we don't have enough threads/blocks in each dimension
                           ((4100,3),[0]),((3,4101),[0]),#10
                           ((1024,33),[0]),((33,1024),[0]),#10
                           ((1025,33),[0]),((33,1025),[0]),#10

                           ((4100,3),[1]),((3,4101),[1]),#01
                           ((1024,33),[1]),((33,1024),[1]),#01
                           ((1025,33),[1]),((33,1025),[1]),#01

                           ((4100,3),[0,1]),((3,4101),[0,1]),#11
                           ((1024,33),[0,1]),((33,1024),[0,1]),#11
                           ((1025,33),[0,1]),((33,1025),[0,1]),#11

                           ((4100,4,3),[0]),((5,4100,3),[0]),((5,4,4100),[0]),#100
                           ((4100,4,3),[1]),((5,4100,3),[1]),((5,4,4100),[1]),#010
                           ((4100,4,3),[2]),((5,4100,3),[2]),((5,4,4100),[2]),#001
                           ((4100,4,3),[0,1]),((5,4100,3),[0,1]),((5,4,4100),[0,1]),#110
                           ((4100,4,3),[1,2]),((5,4100,3),[1,2]),((5,4,4100),[1,2]),#011
                           #((4100,4,3),[0,2]),((5,4100,3),[0,2]),((5,4,4100),[0,2]),#101 ##not implemented
                           ((4100,4,3),[0,1,2]),((5,4100,3),[0,1,2]),((5,4,4100),[0,1,2]),#111

                           ((4100,4,3,2),[2,3]),((4,4100,3,2),[2,3]),((4,3,4100,2),[2,3]),((4,3,2,4100),[2,3]),#0011
                           ((4100,4,3,2),[1,3]),((4,4100,3,2),[1,3]),((4,3,4100,2),[1,3]),((4,3,2,4100),[1,3]),#0101
                           ((4100,4,3,2),[0,2,3]),((4,4100,3,2),[0,2,3]),((4,3,4100,2),[0,2,3]),#((4,3,2,4100),[0,2,3]),#1011
                           ((4100,4,3,2),[1,2,3]),((4,4100,3,2),[1,2,3]),((4,3,4100,2),[1,2,3]),((4,3,2,4100),[1,2,3]),#0111
                           ((4100,2,3,4),[0,1,2,3]),((2,4100,3,4),[0,1,2,3]),((2,3,4100,4),[0,1,2,3]),((2,3,4,4100),[0,1,2,3]),#1111


                           #test pattern implemented by reshape
                           ((4100,4,3,2),[0]),((4,4100,3,2),[0]),((4,3,4100,2),[0]),((4,3,2,4100),[0]),#1000
                           ((4100,4,3,2),[1]),((4,4100,3,2),[1]),((4,3,4100,2),[1]),((4,3,2,4100),[1]),#0100
                           ((4100,4,3,2),[2]),((4,4100,3,2),[2]),((4,3,4100,2),[2]),((4,3,2,4100),[2]),#0010
                           ((4100,4,3,2),[3]),((4,4100,3,2),[3]),((4,3,4100,2),[3]),((4,3,2,4100),[3]),#0001
                           ((1100,2,3,4,5),[0,1,2,3,4]),((2,1100,3,4,5),[0,1,2,3,4]),((2,3,1100,4,5),[0,1,2,3,4]),((2,3,4,1100,5),[0,1,2,3,4]),((2,3,4,5,1100),[0,1,2,3,4]),#11111

                           ]:
        a = tensor.TensorType('float32',(False,)*len(shape))()
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#        val = numpy.ones(shape)
#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val,dtype='float32')
        f = theano.function([a],b, mode=mode_with_gpu)
        f2 = theano.function([a],b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        if val.size==0:
            assert f2(val)==f(val), ('shape', shape, 'pattern', pattern)
        else:
            try:
                # We raise the error threshold as we sum big matrices,
                # and this causes small rounding differences with some seeds,
                # for example in debug mode with unittests.rseed=9275.
                orig_rtol = theano.tensor.basic.float32_rtol
                theano.tensor.basic.float32_rtol = 2e-5
                assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))
            finally:
                theano.tensor.basic.float32_rtol = orig_rtol


        #test with dimshuffle
        #we shuffle the 2 outer dims.
    for shape, pattern in [#((5,),[0]),
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        a = tensor.TensorType('float32',(False,)*len(shape))()
        dim_pattern = range(len(shape))
        dim_pattern[0]=1
        dim_pattern[1]=0
        a = a.dimshuffle(dim_pattern)
        b = T.Sum(pattern)(a)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#        val = numpy.ones(shape)
#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val,dtype='float32')
        f = theano.function([a],b, mode=mode_with_gpu)
        f2 = theano.function([a],b, mode=mode_without_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert _allclose(f2(val),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))


        #test with broadcast
    for shape, pattern in [((5,),[0]),
                           ((5,4),[0,1]),((5,4),[0]),
                           ((5,4,3),[0]),((5,4,3),[0,1]),((5,4,3),[2]),((5,4,3),[0,1,2]),
                           ((5,4,3,2),[0,1,2,3]), ((5,4,3,2),[0,2,3])]:
        shape = numpy.asarray(shape)*2
        a = tensor.TensorType('float32',(False,)*len(shape))()
        a2 = tcn.CudaNdarrayType((False,)*len(shape))()
        b = T.Sum(pattern)(a)
        b2 = T.Sum(pattern)(a2)
        val = numpy.random.rand(numpy.prod(shape)).reshape(shape)
#        val = numpy.ones(shape)
#        val = numpy.arange(numpy.prod(shape)).reshape(shape)
        val = theano._asarray(val,dtype='float32')
        val2 = cuda.CudaNdarray(val)
        if len(shape)==1:
            val = val[::2]
            val2 = val2[::2]
        elif len(shape)==2:
            val = val[::2,::2]
            val2 = val2[::2,::2]
        elif len(shape)==3:
            val = val[::2,::2,::2]
            val2 = val2[::2,::2,::2]
        elif len(shape)==4:
            val = val[::2,::2,::2,::2]
            val2 = val2[::2,::2,::2,::2]
        f = theano.function([a],b, mode=mode_without_gpu)
        f2 = theano.function([a2],b2, mode=mode_with_gpu)
        assert tcn.GpuSum in [x.op.__class__ for x in f2.maker.env.toposort()]
        assert T.Sum in [x.op.__class__ for x in f.maker.env.toposort()]
        assert _allclose(f2(val2),f(val)), ('shape', shape, 'pattern', pattern, sum([shape[i] for i in pattern]))