Example #1
    def run_gradinput(self, inputs_shape, filters_shape, subsample=(1, 1, 1)):
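        # Reorder both shapes so the channel axis follows the batch axis,
        # i.e. the channels-first layout the Corr3dMM ops expect.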
        inputs_shape = [inputs_shape[i] for i in (0, 4, 1, 2, 3)]
        filters_shape = [filters_shape[i] for i in (0, 4, 1, 2, 3)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        bottom_height = (inputs_shape[2] - 1) * subsample[0] + filters_shape[2]
        bottom_width = (inputs_shape[3] - 1) * subsample[1] + filters_shape[3]
        bottom_depth = (inputs_shape[4] - 1) * subsample[2] + filters_shape[4]
        bottom_shape = gpuarray_shared_constructor(
            np.array([bottom_height, bottom_width, bottom_depth]))

        if subsample == (1, 1, 1):
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters), topgrad=ref_cast(inputs))
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs)
        else:
            conv_ref = Corr3dMMGradInputs(subsample=subsample)(
                kern=ref_cast(filters),
                topgrad=ref_cast(inputs),
                shape=bottom_shape)
            conv_gemm = GpuCorr3dMM_gradInputs(subsample=subsample)(
                kern=filters, topgrad=inputs, shape=bottom_shape)

        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
Example #2
    def run_gradweight(self, inputs_shape, filters_shape, dCdH_shape, subsample=(1, 1)):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]
        dCdH_shape = [dCdH_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        dCdH_val = np.random.random(dCdH_shape).astype(config.floatX)
        inputs = gpuarray_shared_constructor(inputs_val)
        dCdH = gpuarray_shared_constructor(dCdH_val)
        shape = gpuarray_shared_constructor(np.array(filters_shape[2:]))
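        # With subsampling, the filter's spatial shape cannot be inferred from
        # dCdH, so it is passed explicitly to the gradWeights ops below.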

        if subsample == (1, 1):
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH)
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(inputs, dCdH)
        else:
            conv_ref = CorrMM_gradWeights(subsample=subsample)(
                ref_cast(inputs), ref_cast(dCdH), shape=shape
            )
            conv_gemm = GpuCorrMM_gradWeights(subsample=subsample)(
                inputs, dCdH, shape=shape
            )

        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)
        f = theano.function([], conv_gemm, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)
Example #3
def test_gpuarray_shared_scalar():
    # By default, we don't put scalar as shared variable on the GPU
    with pytest.raises(TypeError):
        gpuarray_shared_constructor(np.asarray(1, dtype="float32"))

    # But we can force that
    gpuarray_shared_constructor(np.asarray(1, dtype="float32"),
                                target=test_ctx_name)
Example #4
    def run_conv_valid(
        self,
        inputs_shape,
        filters_shape,
        border_mode="valid",
        filter_dilation=(1, 1),
        subsample=(1, 1),
        unshared=False,
        verify_grad=False,
    ):
        inputs_shape = [inputs_shape[i] for i in (0, 3, 1, 2)]
        if unshared:
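            # Unshared (locally connected) filters are 6D here; the trailing
            # channel axis is moved ahead of the two filter spatial axes.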
            filters_shape = [filters_shape[i] for i in (0, 1, 2, 5, 3, 4)]
        else:
            filters_shape = [filters_shape[i] for i in (0, 3, 1, 2)]

        inputs_val = np.random.random(inputs_shape).astype(config.floatX)
        filters_val = np.random.random(filters_shape).astype(config.floatX)

        inputs = gpuarray_shared_constructor(inputs_val)
        filters = gpuarray_shared_constructor(filters_val)

        conv_ref = CorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(ref_cast(inputs), ref_cast(filters))
        f_ref = theano.function([], conv_ref, mode=mode_without_gpu)

        conv = GpuCorrMM(
            border_mode=border_mode,
            filter_dilation=filter_dilation,
            subsample=subsample,
            unshared=unshared,
        )(inputs, filters)
        f = theano.function([], conv, mode=mode_with_gpu)

        res_ref = f_ref()
        res = f()
        utt.assert_allclose(res_ref, res)

        if verify_grad:
            utt.verify_grad(
                GpuCorrMM(
                    border_mode=border_mode,
                    filter_dilation=filter_dilation,
                    subsample=subsample,
                    unshared=unshared,
                ),
                [inputs_val, filters_val],
                mode=mode_with_gpu,
            )
Example #5
def test_overflow_gpu_new_backend():
    seed = 12345
    n_substreams = 7
    curr_rstate = np.array([seed] * 6, dtype="int32")
    rstate = [curr_rstate.copy()]
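    # Each substream state is derived from the previous one by jumping the
    # generator ahead 2**72 steps (ff_2p72).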
    for j in range(1, n_substreams):
        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
    rstate = np.asarray(rstate)
    rstate = gpuarray_shared_constructor(rstate)
    fct = functools.partial(GPUA_mrg_uniform.new,
                            rstate,
                            ndim=None,
                            dtype="float32")
    # should raise error as the size overflows
    sizes = [
        (2**31, ),
        (2**32, ),
        (
            2**15,
            2**16,
        ),
        (2, 2**15, 2**15),
    ]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
    # should not raise error
    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
    # should support int32 sizes
    sizes = [(np.int32(2**10), ),
             (np.int32(2), np.int32(2**10), np.int32(2**10))]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
Example #6
def test_overflow_gpu_new_backend():
    # run with THEANO_FLAGS=mode=FAST_RUN,init_gpu_device=cuda1,device=cpu
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor
    seed = 12345
    n_substreams = 7
    curr_rstate = numpy.array([seed] * 6, dtype='int32')
    rstate = [curr_rstate.copy()]
    for j in range(1, n_substreams):
        rstate.append(rng_mrg.ff_2p72(rstate[-1]))
    rstate = numpy.asarray(rstate)
    rstate = gpuarray_shared_constructor(rstate)
    fct = functools.partial(rng_mrg.GPUA_mrg_uniform.new, rstate,
                            ndim=None, dtype='float32')
    # should raise error as the size overflows
    sizes = [(2**31, ), (2**32, ), (2**15, 2**16,), (2, 2**15, 2**15)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=True)
    # should not raise error
    sizes = [(2**5, ), (2**5, 2**5), (2**5, 2**5, 2**5)]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
    # should support int32 sizes
    sizes = [(numpy.int32(2**10), ),
             (numpy.int32(2), numpy.int32(2**10), numpy.int32(2**10))]
    rng_mrg_overflow(sizes, fct, mode, should_raise_error=False)
Example #7
def test_validate_input_types_gpuarray_backend():
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags

    with change_flags(compute_test_value="raise"):
        rstate = np.zeros((7, 6), dtype="int32")
        rstate = gpuarray_shared_constructor(rstate)
        mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3, ))
Example #8
def test_validate_input_types_gpuarray_backend():
    from theano.sandbox.rng_mrg import mrg_uniform
    from theano.gpuarray.type import gpuarray_shared_constructor
    from theano.configparser import change_flags

    with change_flags(compute_test_value="raise"):
        rstate = numpy.zeros((7, 6), dtype="int32")
        rstate = gpuarray_shared_constructor(rstate)
        mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3,))
Example #9
def test_set_value_non_contiguous():
    s = gpuarray_shared_constructor(
        np.asarray([[1.0, 2.0], [1.0, 2.0], [5, 6]]))
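    # Re-assign every other row of the internal GPU array: a strided,
    # non-contiguous view of the same storage.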
    s.set_value(s.get_value(borrow=True, return_internal_type=True)[::2],
                borrow=True)
    assert not s.get_value(borrow=True,
                           return_internal_type=True).flags["C_CONTIGUOUS"]
    # In the past, this failed
    s.set_value([[0, 0], [1, 1]])
Example #10
def test_incsub_offset():
    # Test for https://github.com/Theano/Theano/issues/5670

    # Build a GPU variable which value will have an offset (x1)
    x = gpuarray_shared_constructor(np.zeros(5, dtype=theano.config.floatX))
    x1 = x[1:]
    # Use inc_subtensor on it
    y = tensor.vector()
    z = tensor.inc_subtensor(x1[2:], y)
    # Use updates so that inc_subtensor can happen inplace
    f = theano.function([y], z, updates={x: z}, mode=mode_with_gpu)
    utt.assert_allclose(f([1, 2]), np.array([0, 0, 1, 2], dtype=theano.config.floatX))
Example #11
def test_consistency_GPUA_parallel():
    """
    Verify that the random numbers generated by GPUA_mrg_uniform, in
    parallel, are the same as the reference (Java) implementation by
    L'Ecuyer et al.

    """
    from theano.gpuarray.tests.test_basic_ops import \
        mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7  # 7 samples will be drawn in parallel

    samples = []
    curr_rstate = numpy.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_samples = []
        rstate = [curr_rstate.copy()]
        for j in range(1, n_substreams):
            rstate.append(rng_mrg.ff_2p72(rstate[-1]))
        rstate = numpy.asarray(rstate)
        rstate = gpuarray_shared_constructor(rstate)

        new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate, ndim=None,
                                                          dtype='float32',
                                                          size=(n_substreams,))
        rstate.default_update = new_rstate

        # Not really necessary, just mimicking
        # rng_mrg.MRG_RandomStreams' behavior
        sample.rstate = rstate
        sample.update = (rstate, new_rstate)

        # We need the sample back in the main memory
        cpu_sample = tensor.as_tensor_variable(sample)
        f = theano.function([], cpu_sample, mode=mode)

        for k in range(n_samples):
            s = f()
            stream_samples.append(s)

        samples.append(numpy.array(stream_samples).T.flatten())

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = numpy.array(samples).flatten()
    assert(numpy.allclose(samples, java_samples))
Example #12
    def __init__(self, input_size, hidden_size, dtype=theano.config.floatX):
        self.grub = dnn.RNNBlock(dtype=dtype,
                                 hidden_size=hidden_size,
                                 num_layers=1,
                                 rnn_mode='gru')

        self.input_size = input_size
        self.hidden_size = hidden_size

        psize = self.grub.get_param_size((1, input_size))
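        # get_param_size returns the total number of weights and biases the
        # cuDNN GRU needs; they are stored in one flat shared buffer below.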

        self.params = gpuarray_shared_constructor(
            np.zeros(psize, dtype=theano.config.floatX))
Example #13
    def __init__(self,rng,n_hidden,x,
                 E,xmask,is_train,dropout,mode='lstm',
                 n_layer=1, pre_state=None,**kwargs):

        self.is_train=is_train
        self.dropout=dropout

        self.rng=rng
        self.xmask=xmask

        shape=x.shape
        embd=E[x.flatten()]
        embd=embd.reshape([shape[0],shape[1],-1])

        if pre_state is None:
            h0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
            pre_state = [h0, ]
            if mode=='lstm':
                c0 = T.zeros((n_layer, shape[1], n_hidden), dtype=theano.config.floatX)
                pre_state.append(c0)


        rnnb=dnn.RNNBlock(dtype=theano.config.floatX,
                          hidden_size=n_hidden,
                          num_layers=n_layer,
                          rnn_mode=mode,
                          input_mode='skip',
                          direction_mode='unidirectional')
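        # input_mode='skip' feeds the embeddings straight into the RNN with no
        # learned input projection, so the embedding size must equal n_hidden.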
        psize=rnnb.get_param_size([1,n_hidden])
        print(psize)
        params_cudnn = gpuarray_shared_constructor(
            np.zeros((psize,), dtype=theano.config.floatX)
        )
        #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
        #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
        #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
        self.params=[params_cudnn,]

        if mode=='lstm':
            h=rnnb.apply(params_cudnn,embd,pre_state[0],pre_state[1])[0]
        else:
            h=rnnb.apply(params_cudnn,embd,pre_state[0])[0]

        h=h*self.xmask.dimshuffle(0,1,'x')

        # Dropout
        if self.dropout > 0:
            drop_mask = self.rng.binomial(n=1, p=1 - self.dropout, size=h.shape, dtype=theano.config.floatX)
            self.activation = T.switch(self.is_train, h * drop_mask, h * (1 - self.dropout))
        else:
            self.activation = T.switch(self.is_train, h, h)
Example #14
def test_consistency_GPUA_serial():
    # Verify that the random numbers generated by GPUA_mrg_uniform, serially,
    # are the same as the reference (Java) implementation by L'Ecuyer et al.
    from theano.gpuarray.tests.config import mode_with_gpu as mode
    from theano.gpuarray.type import gpuarray_shared_constructor

    seed = 12345
    n_samples = 5
    n_streams = 12
    n_substreams = 7

    samples = []
    curr_rstate = numpy.array([seed] * 6, dtype='int32')

    for i in range(n_streams):
        stream_rstate = curr_rstate.copy()
        for j in range(n_substreams):
            substream_rstate = numpy.array([stream_rstate.copy()],
                                           dtype='int32')
            # Transfer to device
            rstate = gpuarray_shared_constructor(substream_rstate)

            new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate,
                                                              ndim=None,
                                                              dtype='float32',
                                                              size=(1,))
            rstate.default_update = new_rstate

            # Not really necessary, just mimicking
            # rng_mrg.MRG_RandomStreams' behavior
            sample.rstate = rstate
            sample.update = (rstate, new_rstate)

            # We need the sample back in the main memory
            cpu_sample = tensor.as_tensor_variable(sample)
            f = theano.function([], cpu_sample, mode=mode)
            for k in range(n_samples):
                s = f()
                samples.append(s)

            # next substream
            stream_rstate = rng_mrg.ff_2p72(stream_rstate)

        # next stream
        curr_rstate = rng_mrg.ff_2p134(curr_rstate)

    samples = numpy.array(samples).flatten()
    assert(numpy.allclose(samples, java_samples))
Example #15
    def test_blocksparse_grad_merge(self):
        b = tensor.fmatrix()
        h = tensor.ftensor3()
        iIdx = tensor.lmatrix()
        oIdx = tensor.lmatrix()

        W_val, h_val, iIdx_val, b_val, oIdx_val = self.gemv_data()
        W = gpuarray_shared_constructor(W_val, context=test_ctx_name)

        o = gpu_sparse_block_gemv(b.take(oIdx, axis=0), W, h, iIdx, oIdx)
        gW = theano.grad(o.sum(), W)

        lr = np.asarray(0.05, dtype="float32")

        upd = W - lr * gW

        f1 = theano.function([h, iIdx, b, oIdx],
                             updates=[(W, upd)],
                             mode=mode_with_gpu)

        # Make sure the lr update was merged.
        assert isinstance(f1.maker.fgraph.outputs[0].owner.op,
                          GpuSparseBlockOuter)

        # Exclude the merge optimizations.
        mode = mode_with_gpu.excluding("local_merge_blocksparse_alpha")
        mode = mode.excluding("local_merge_blocksparse_output")

        f2 = theano.function([h, iIdx, b, oIdx], updates=[(W, upd)], mode=mode)

        # Make sure the lr update is not merged.
        assert not isinstance(f2.maker.fgraph.outputs[0].owner.op,
                              GpuSparseBlockOuter)

        f2(h_val, iIdx_val, b_val, oIdx_val)
        W_ref = W.get_value()

        # reset the var
        W.set_value(W_val)
        f1(h_val, iIdx_val, b_val, oIdx_val)
        W_opt = W.get_value()

        utt.assert_allclose(W_ref, W_opt)
Example #16
def test_Gpujoin_inplace():
    # Test Gpujoin to work inplace.
    #
    # This function tests the case when several elements are passed to the
    # Gpujoin function but all except one of them are empty. In this case
    # Gpujoin should work inplace and the output should be the view of the
    # non-empty element.
    s = tt.lscalar()
    data = np.array([3, 4, 5], dtype=theano.config.floatX)
    x = gpuarray_shared_constructor(data, borrow=True)
    z = tt.zeros((s, ))

    join = GpuJoin(view=0)
    c = join(0, x, z)

    f = theano.function([s], theano.Out(c, borrow=True))
    if not isinstance(mode_with_gpu, theano.compile.DebugMode):
        assert x.get_value(borrow=True, return_internal_type=True) is f(0)
    assert np.allclose(f(0), [3, 4, 5])
Example #17
    def _params_to_cudnn(self):
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        assert dnn.dnn_available(None)
        self._rnn_block = dnn.RNNBlock(theano.config.floatX,
                                       self.hidden_dim,
                                       num_layers=1,
                                       input_mode="linear",
                                       rnn_mode=self.rnn_type,
                                       direction_mode="unidirectional")
        param_size = self._rnn_block.get_param_size(
            [self.n_batch, self.input_dim])  # TODO: study about n_batch
        self.params = [gpuarray_shared_constructor(Constant(0.0)(param_size))]
        cs = self._rnn_block.split_params(self.params[0],
                                          layer=0,
                                          input_size=[
                                              self.n_batch, self.input_dim
                                          ])  # TODO: multi layer support
        for c, p in zip(cs, self.non_cudnn_params):
            c[:] = p.get_value(borrow=True, return_internal_type=True)
Example #18
def test_cpu_target_with_shared_variable():
    srng = MRG_RandomStream()
    s = np.random.rand(2, 3).astype("float32")
    x = gpuarray_shared_constructor(s, name="x")
    try:
        # To have theano.shared(x) try to move on the GPU
        theano.compile.shared_constructor(gpuarray_shared_constructor)
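        # target="cpu" keeps the sampling op on the CPU, so no GPUA_mrg_uniform
        # node should appear in the compiled graph (checked below).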
        y = srng.uniform(x.shape, target="cpu")
        y.name = "y"
        z = (x * y).sum()
        z.name = "z"

        fz = theano.function([], z, mode=mode)

        nodes = fz.maker.fgraph.toposort()
        assert not any(
            [isinstance(node.op, GPUA_mrg_uniform) for node in nodes])
    finally:
        theano.compile.shared_constructor(gpuarray_shared_constructor,
                                          remove=True)
Example #19
def test_GPUA_full_fill():
    # Make sure the whole sample buffer is filled.  Also make sure
    # large samples are consistent with CPU results.
    import theano.gpuarray.tests.config
    from theano.gpuarray.type import gpuarray_shared_constructor

    # This needs to be large to trigger the problem on GPU
    size = (10, 1000)

    R = MRG_RandomStreams(234, use_cuda=False)
    uni = R.uniform(size, nstreams=60 * 256)
    f_cpu = theano.function([], uni)

    rstate_gpu = gpuarray_shared_constructor(R.state_updates[-1][0].get_value())
    new_rstate, sample = rng_mrg.GPUA_mrg_uniform.new(rstate_gpu, ndim=None,
                                                      dtype='float32',
                                                      size=size)
    rstate_gpu.default_update = new_rstate
    f_gpu = theano.function([], sample)

    utt.assert_allclose(f_cpu(), f_gpu())
Example #20
def test_elemwise_pow():
    # Test that GpuElemwise(pow) can compile with any combination of integer
    # or float input dtype.
    dtypes = [
        "uint8",
        "uint16",
        "uint32",
        "uint64",
        "int8",
        "int16",
        "int32",
        "int64",
        "float16",
        "float32",
        "float64",
    ]

    for dtype_base in dtypes:
        for dtype_exp in dtypes:

            # Compile a gpu function with the specified dtypes
            base_val = np.random.randint(0, 5, size=10).astype(dtype_base)
            exp_val = np.random.randint(0, 3, size=10).astype(dtype_exp)

            base = theano.tensor.vector(dtype=dtype_base)
            exp = gpuarray_shared_constructor(exp_val)
            assert exp.dtype == dtype_exp
            output = base**exp
            f = theano.function([base], output, mode=mode_with_gpu)
            # We don't transfer to the GPU when the output dtype is int*
            n = len([
                n for n in f.maker.fgraph.apply_nodes
                if isinstance(n.op, GpuElemwise)
            ])
            assert n == (output.dtype in tensor.float_dtypes)

            # Call the function to make sure the output is valid
            out = f(base_val)
            expected_out = base_val**exp_val
            assert_allclose(out, expected_out)
Example #21
    def __init__(self,
                 rng,
                 n_hidden,
                 x,
                 xmask,
                 is_train,
                 dropout,
                 mode='gru',
                 n_layer=1,
                 pre_state=None,
                 **kwargs):

        prefix = "BiGRU_"
        Wc = norm_weight(n_hidden * 2, n_hidden, name=prefix + 'Wc')
        bc = zero_bias(n_hidden, prefix + 'bc')

        self.is_train = is_train
        self.dropout = dropout

        self.rng = rng
        self.xmask = xmask

        if pre_state is None:
            h0 = T.zeros((n_layer, x.shape[1], n_hidden),
                         dtype=theano.config.floatX)
            pre_state = [
                h0,
            ]
            if mode == 'lstm':
                c0 = T.zeros((n_layer, x.shape[1], n_hidden),
                             dtype=theano.config.floatX)
                pre_state.append(c0)

        rnnb = dnn.RNNBlock(dtype=theano.config.floatX,
                            hidden_size=n_hidden,
                            num_layers=n_layer,
                            rnn_mode=mode,
                            input_mode='skip',
                            direction_mode='bidirectional')
        psize = rnnb.get_param_size([1, n_hidden])
        print(psize)
        params_cudnn = gpuarray_shared_constructor(
            np.zeros((psize, ), dtype=theano.config.floatX))
        #l = np.sqrt(6.) / np.sqrt(4 * n_hidden)
        #pvalue = np.asarray(self.rng.uniform(low=-l, high=l, size=(psize,)), dtype=theano.config.floatX)
        #params_cudnn=gpuarray_shared_constructor(pvalue,name='cudnn')
        self.params = [
            params_cudnn,
        ]

        if mode == 'lstm':
            h = rnnb.apply(params_cudnn, x, pre_state[0], pre_state[1])[0]
        else:
            h = rnnb.apply(params_cudnn, x, pre_state[0])[0]

        h = h * self.xmask.dimshuffle(0, 1, 'x')
        self.context = h

        ctx_mean = (h *
                    self.xmask[:, :, None]).sum(0) / self.xmask.sum(0)[:, None]

        self.activation = T.tanh(T.dot(ctx_mean, Wc) + bc)

        # Dropout
        if self.dropout > 0:
            drop_mask = self.rng.binomial(n=1,
                                          p=1 - self.dropout,
                                          size=h.shape,
                                          dtype=theano.config.floatX)
            self.activation = T.switch(self.is_train, h * drop_mask,
                                       h * (1 - self.dropout))
        else:
            self.activation = T.switch(self.is_train, h, h)
Example #22
    def shared(x, **kwargs):
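        # Convenience wrapper: always place shared variables on the test GPU context.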
        return gpuarray_shared_constructor(x, target=test_ctx_name, **kwargs)
Example #23
    def __init__(self, num_layers=1, direction=0, **kwargs):
        # this has to be provided in THEANO_FLAGS as e.g. contexts=gpu0->cuda0
        context_name = kwargs.get('device', str(theano.config.device))
        #if context_name == 'cpu':
        #  context_name = 'gpu0'
        kwargs['device'] = context_name
        #kwargs['n_out'] *= 2
        super(RNNBlockLayer, self).__init__(**kwargs)
        self.params = {}
        #self.attrs['n_out'] /= 2
        #self.set_attr('nout', self.attrs['n_out'] / 4)
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        from theano.tensor.extra_ops import cpu_contiguous
        #from theano.sandbox.cuda.basic_ops import gpu_contiguous

        rnnb = dnn.RNNBlock(
            dtype=theano.config.floatX,
            hidden_size=self.attrs['n_out'],
            num_layers=num_layers,
            rnn_mode='lstm',
            input_mode='linear',
            direction_mode='unidirectional'
            if direction != 0 else 'bidirectional',
            context_name=context_name if context_name != 'cpu' else 'gpu0')

        buffer_size = 1  # self.attrs['n_out'] * num_layers
        #X = self.get_linear_forward_output()
        #X = T.concatenate([s.output for s in self.sources],axis=2)[::direction or 1]
        X = cpu_contiguous(
            T.concatenate([s.output for s in self.sources],
                          axis=2)[::direction or 1])
        #X = cpu_contiguous(self.sources[0].output[::direction or 1])
        #X = T.concatenate([X,T.zeros((X.shape[0],batch_size - X.shape[1] + 1,X.shape[2]),X.dtype)],axis=1)[:,:-1]
        n_in = sum([s.attrs['n_out'] for s in self.sources])
        psize = rnnb.get_param_size([buffer_size, n_in])
        l = numpy.sqrt(6.) / numpy.sqrt(4 * self.attrs['n_out'])
        pvalue = numpy.asarray(self.rng.uniform(low=-l, high=l,
                                                size=(psize, )),
                               dtype=theano.config.floatX)
        if context_name == 'cpu':
            params_cudnn = self.add_param(
                self.create_bias(psize, name='cudnn_%s' % self.name))
        else:
            params_cudnn = self.add_param(
                gpuarray_shared_constructor(pvalue,
                                            target=context_name,
                                            name='cudnn_%s' % self.name))
        c_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))
        h_init = cpu_contiguous(
            T.alloc(numpy.cast[theano.config.floatX](0), num_layers,
                    X.shape[1], self.attrs['n_out']))

        W_out = self.add_param(
            self.create_random_uniform_weights(
                self.attrs['n_out'], self.y_in[self.attrs['target']].n_out))
        b_out = self.add_param(
            self.create_bias(self.y_in[self.attrs['target']].n_out))

        if context_name == 'cpu':
            self.cost_val = T.constant(0)
            self.error_val = T.constant(0)
            self.known_grads = {}
            return

        out = rnnb.apply(params_cudnn, X, h_init, c_init)[0]
        out = out[::-1]
        out = T.dot(out, W_out) + b_out
        self.y_m = out.reshape((out.shape[0] * out.shape[1], out.shape[2]))

        self.i = (self.index.flatten() > 0).nonzero()
        self.y_data_flat = self.y_in[self.attrs['target']].flatten()
        nll, _ = T.nnet.crossentropy_softmax_1hot(
            x=self.y_m[self.i], y_idx=self.y_data_flat[self.i])
        self.cost_val = T.sum(nll)

        #self.cost_val = -T.sum(T.log(out[:,self.y_in[self.attrs['target']].flatten()][(self.index.flatten()>0).nonzero()]))
        self.known_grads = {params_cudnn: T.grad(self.cost_val, params_cudnn)}
        self.output = out
        self.index = self.sources[0].index

        self.error_val = T.sum(
            T.neq(T.argmax(self.y_m[self.i], axis=-1),
                  self.y_data_flat[self.i]))
Example #24
def test_dnn_rnn_lstm():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize, ), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim,
                    hidden_dim,
                    last_layer,
                    s0=h0[i, :, :],
                    c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        layer_params = lstm.get_params()
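        # split_params returns views into the flat cuDNN parameter buffer;
        # assigning through them copies the reference LSTM weights into place.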
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(
        rnnb.apply(params_cudnn, X, h0, c0)[0], [params_cudnn])

    x_val = np.random.random(
        (timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = np.random.random(
        (timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)

    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)

    utt.assert_allclose(ref_grads[0], cudnn_grads[0])
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])

    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])

    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[i * len(cudnn_grads_layer):(i + 1) *
                                           len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
Example #25
def test_validate_input_types_gpuarray_backend():
    with config.change_flags(compute_test_value="raise"):
        rstate = np.zeros((7, 6), dtype="int32")
        rstate = gpuarray_shared_constructor(rstate)
        rng_mrg.mrg_uniform.new(rstate, ndim=None, dtype="float32", size=(3, ))
Example #26
X = T.tensor3('X')
Y = T.tensor3('Y')
h0 = T.tensor3('h0')
c0 = T.tensor3('c0')

rnnb = dnn.RNNBlock(
    theano.config.floatX,
    hidden_dim,
    depth,
    network_type,
    input_mode='skip'
)
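# With input_mode='skip' the input feeds the hidden layer directly, so the
# parameter size is computed from hidden_dim rather than a separate input size.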
psize = rnnb.get_param_size([batch_size, hidden_dim])
params_cudnn = gpuarray_shared_constructor(
    np.zeros((psize,), dtype=theano.config.floatX)
)

# lstm = LSTM(input_dim, hidden_dim)
output = rnnb.apply(params_cudnn, X, h0, c0)[0]  # Only hidden states
cost = T.mean((Y - output) ** 2)
grads = T.grad(cost, params_cudnn)
cudnn_fn = theano.function(
    inputs=[],
    outputs=output,
    mode=mode_with_gpu,
    givens={X: x_val, h0: h0_val, c0: c0_val}
)
cudnn_grad_fn = theano.function(
    inputs=[],
    outputs=grads,