Example #1
File: basic_ops.py  Project: liqin123/odin
def cudnn_available():
    """ return True if running on GPU with cuDNN available """
    if config['device'] == 'gpu':
        # theano backend
        if config['backend'] == 'theano':
            try:
                if package_installed(name='pygpu'):
                    from theano.gpuarray import dnn
                    from theano.gpuarray.type import list_contexts
                    return dnn.dnn_available(list_contexts()[0])
                else:
                    from theano.sandbox.cuda import dnn
                    return dnn.dnn_available()
            except ImportError:
                return False
        # tensorflow backend
        else:
            try:
                import commands  # Python 2 only
            except ImportError:
                import subprocess as commands  # Python 3: getstatusoutput lives in subprocess
            if platform.system() == "Darwin":
                x = commands.getstatusoutput('ls /usr/local/cuda/lib')
                x = x[-1].split('\n')
            elif platform.system() == "Windows":
                raise Exception('No support for Windows')
            else:
                x = commands.getstatusoutput('ldconfig -p')
                x = x[-1].split('=>')
            return builtins.any('libcudnn' in i for i in x)
    return False
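A minimal sketch of the same check done directly against Theano's gpuarray backend (an illustration, assuming the legacy theano.gpuarray backend and pygpu are installed; passing None selects the default GPU context):

from theano.gpuarray import dnn

if dnn.dnn_available(None):
    # dnn.version() reports the cuDNN version that Theano detected
    print('cuDNN version:', dnn.version())
else:
    # dnn_available records the failure reason in its .msg attribute
    print('cuDNN unavailable:', dnn.dnn_available.msg)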
Example #2
def local_cudnn_maxandargmax(node):
    if not isinstance(node.op, GpuMaxAndArgmax):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    if node.inputs[0].dtype != node.outputs[0].dtype:
        return

    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(
            node.op.axis)) != node.op.axis:
        return

    max, arg = GpuDnnReduction("maximum", node.op.axis, node.outputs[0].dtype,
                               node.outputs[0].dtype, True)(node.inputs[0])

    # cuDNN can only return int32 indices, so cast back to the op's int64 output dtype
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"),
                             node.outputs[1].type.context_name),
    )
Example #3
def local_gpua_avg_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        # We reuse out_grad because cuDNN does not use the value of the `out`
        # argument but still checks its shape for average pooling. This
        # has been observed in v2 and v3 as far as I know.
        return GpuDnnPoolGrad(mode=mode)(inp, out_grad, out_grad, ws, stride,
                                         pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_grad_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
Example #4
def local_gpua_pool_dnn_grad_stride(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    inp, out, out_grad, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    inp = gpu_contiguous(as_gpuarray_variable(inp, ctx_name))
    out = gpu_contiguous(as_gpuarray_variable(out, ctx_name))
    out_grad = gpu_contiguous(as_gpuarray_variable(out_grad, ctx_name))
    mode = op.mode

    # the GPU ops expect exactly 2 non-pooling dimensions
    if inp.ndim == nd + 2:
        return GpuDnnPoolGrad(mode=mode)(inp, out, out_grad, ws, stride, pad)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        inp_padded = pad_dims(inp, 2, nd)
        out_padded = pad_dims(out, 2, nd)
        out_grad_padded = pad_dims(out_grad, 2, nd)
        ret_padded = GpuDnnPoolGrad(mode=mode)(inp_padded, out_padded,
                                               out_grad_padded, ws, stride,
                                               pad)
        return unpad_dims(ret_padded, inp, 2, nd)
Example #5
def local_softmax_dnn(node):
    if isinstance(node.op, GpuSoftmax):
        if not dnn_available(node.outputs[0].type.context_name):
            return
        ins = node.inputs[0].dimshuffle(0, 1, "x", "x")
        ins = gpu_contiguous(ins)
        out = GpuDnnSoftmax("accurate", "channel")(ins)
        out = as_gpuarray_variable(out.dimshuffle(0, 1), out.type.context_name)
        return [out]
Example #6
def local_gpua_softmax_dnn_grad(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    ins = []
    for n in inputs:
        n = as_gpuarray_variable(n, ctx_name)
        if n.ndim != 2:
            return
        ins.append(n.dimshuffle(0, "x", 1, "x"))

    out = GpuDnnSoftmaxGrad("accurate", "instance")(gpu_contiguous(ins[0]),
                                                    gpu_contiguous(ins[1]))
    return [out.dimshuffle(0, 2)]
Example #7
    def apply(self, fgraph):
        """
        Raise an error if cuDNN can't be used.

        """
        for c in list_contexts():
            if not dnn_available(c):
                # Raise an AssertionError because we want Theano to fail,
                # not just skip this optimization.
                raise AssertionError(
                    "cuDNN optimization was enabled, but Theano was not able "
                    "to use it for context " + str(c) +
                    ". We got this error: \n" + dnn_available.msg)
Example #8
def local_gpua_logsoftmax_to_dnn(op, ctx_name, inputs, outputs):
    # Transform the input in the format expected by GpuDnnSoftmax
    inp = inputs[0]
    if inp.ndim != 2:
        return
    if not dnn_available(ctx_name):
        return

    inp = inp.dimshuffle(0, 1, "x", "x")
    inp.tag.context_name = ctx_name

    # Apply GpuDnnSoftmax and return the result
    out = GpuDnnSoftmax("log", "channel")(gpu_contiguous(inp))
    return [out.dimshuffle(0, 1)]
Example #9
def local_mypool_dnn_alternative(node):
    if not dnn_available():
        return
    if isinstance(node.op, MyPool):
        if not node.op.ignore_border:
            return
        img, = node.inputs
        ds = node.op.ds
        stride = node.op.st
        pad = node.op.padding
        mode = node.op.mode
        if (img.owner and isinstance(img.owner.op, HostFromGpu)):
            ret = dnn_pool(gpu_contiguous(img.owner.inputs[0]),
                           ds, stride=stride, pad=pad, mode=mode)
            return [host_from_gpu(ret)]
Example #10
    def run_test_case_gi(self,
                         i,
                         f,
                         o,
                         s,
                         b,
                         flip,
                         provide_shape,
                         fd=(1, 1),
                         expect_error=False):
        if not dnn_available(test_ctx_name):
            pytest.skip(dnn_available.msg)

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        mode = mode_with_gpu

        if not expect_error:
            self.run_gradinput(
                inputs_shape=i,
                filters_shape=f,
                output_shape=o,
                subsample=s,
                verify_grad=True,
                mode=mode,
                provide_shape=provide_shape,
                border_mode=b,
                filter_flip=flip,
                target_op=GpuDnnConvGradI,
                filter_dilation=fd,
            )
        else:
            with pytest.raises((RuntimeError, ValueError)):
                self.run_gradinput(
                    inputs_shape=i,
                    filters_shape=f,
                    output_shape=o,
                    subsample=s,
                    verify_grad=False,
                    mode=mode,
                    provide_shape=provide_shape,
                    border_mode=b,
                    filter_flip=flip,
                    target_op=GpuDnnConvGradI,
                    ref=None,
                    filter_dilation=fd,
                )
Example #11
    def test_import_without_gpu_or_cudnn_raises(self):
        if theano_backend == 'pygpu':
            from theano.gpuarray import dnn
            if dnn.dnn_present():
                pytest.skip()
        elif theano_backend == 'pygpu_sandbox':
            from theano.sandbox.gpuarray import dnn
            if dnn.dnn_present():
                pytest.skip()
        elif theano_backend == 'cuda_sandbox':
            from theano.sandbox.cuda import dnn
            if dnn.dnn_available():
                pytest.skip()
        else:
            with pytest.raises(ImportError):
                import lasagne.layers.dnn
Example #12
    def run_test_case(self, i, f, s, b, flip, provide_shape, fd=(1, 1)):
        if not dnn_available(test_ctx_name):
            pytest.skip(dnn_available.msg)

        mode = mode_with_gpu

        if fd != (1, 1):
            pytest.skip("Doesn't have CUDNN implementation")

        o = self.get_output_shape(i, f, s, b, fd)

        self.run_fwd(
            inputs_shape=i,
            filters_shape=f,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConv,
        )
        self.run_gradweight(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradW,
        )
        self.run_gradinput(
            inputs_shape=i,
            filters_shape=f,
            output_shape=o,
            subsample=s,
            verify_grad=True,
            mode=mode,
            provide_shape=provide_shape,
            border_mode=b,
            filter_flip=flip,
            target_op=GpuDnnConvGradI,
        )
Example #13
    def setUp(self):
        """
        Set up a test image and filter to re-use.
        """
        skip_if_no_gpu()
        if not dnn_available():
            raise SkipTest('Skipping tests because cuDNN is not available')
        self.orig_floatX = theano.config.floatX
        theano.config.floatX = 'float32'
        self.image = np.random.rand(1, 1, 3, 3).astype(theano.config.floatX)
        self.image_tensor = tensor.tensor4()
        self.input_space = Conv2DSpace((3, 3), 1, axes=('b', 'c', 0, 1))
        self.filters_values = np.ones((1, 1, 2, 2), dtype=theano.config.floatX)
        self.filters = sharedX(self.filters_values, name='filters')
        self.batch_size = 1

        self.cudnn2d = Cudnn2D(self.filters, self.batch_size, self.input_space)
Example #14
def local_gpua_pool_dnn_alternative(fgraph, op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return
    if not op.ignore_border:
        return
    img, ws, stride, pad = inputs
    nd = op.ndim
    if nd not in (2, 3):
        return
    img = gpu_contiguous(as_gpuarray_variable(img, ctx_name))
    mode = op.mode
    # dnn_pool expects exactly 2 non-pooling dimensions
    if img.ndim == nd + 2:
        return dnn_pool(img, ws, stride=stride, pad=pad, mode=mode)
    else:
        # reshape to 4D or 5D with 2 non-pooling dimensions
        img_padded = pad_dims(img, 2, nd)
        ret_padded = dnn_pool(img_padded, ws, stride=stride, pad=pad, mode=mode)
        return unpad_dims(ret_padded, img, 2, nd)
Example #15
File: leaf.py  Project: ShigekiKarita/t712
    def _params_to_cudnn(self):
        from theano.gpuarray import dnn
        from theano.gpuarray.type import gpuarray_shared_constructor
        assert dnn.dnn_available(None)
        self._rnn_block = dnn.RNNBlock(theano.config.floatX,
                                       self.hidden_dim,
                                       num_layers=1,
                                       input_mode="linear",
                                       rnn_mode=self.rnn_type,
                                       direction_mode="unidirectional")
        param_size = self._rnn_block.get_param_size(
            [self.n_batch, self.input_dim])  # TODO: study about n_batch
        self.params = [gpuarray_shared_constructor(Constant(0.0)(param_size))]
        cs = self._rnn_block.split_params(self.params[0],
                                          layer=0,
                                          input_size=[
                                              self.n_batch, self.input_dim
                                          ])  # TODO: multi layer support
        for c, p in zip(cs, self.non_cudnn_params):
            c[:] = p.get_value(borrow=True, return_internal_type=True)
Example #16
def local_dnn_argmax(op, ctx_name, inputs, outputs):
    if not dnn_available(ctx_name):
        return

    if version(raises=False) < 6000:
        return

    if inputs[0].ndim > 8:
        return

    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return

    max, arg = GpuDnnReduction("maximum", op.axis, inputs[0].dtype,
                               inputs[0].dtype, True)(*inputs)

    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
Example #17
def local_mypool_dnn_grad_stride(node):
    if not dnn_available():
        return
    if isinstance(node.op, MyMaxPoolGrad):
        if not node.op.ignore_border:
            return
        inp, out, inp_grad = node.inputs
        ds = node.op.ds
        st = node.op.st
        pad = node.op.padding
        mode = node.op.mode

        if ((inp.owner and isinstance(inp.owner.op, HostFromGpu)) or
            (out.owner and isinstance(out.owner.op, HostFromGpu)) or
            (inp_grad.owner and isinstance(inp_grad.owner.op,
                                           HostFromGpu))):

            ret = GpuDnnPoolGrad(mode=mode)(gpu_contiguous(inp),
                                            gpu_contiguous(out),
                                            gpu_contiguous(inp_grad),
                                            ds, st, pad)
            return [host_from_gpu(ret)]
Example #18
def init_dev(dev, name=None, preallocate=None):
    global pygpu_activated
    global theano_gpu_is_already_active
    if (
        not theano_gpu_is_already_active
        and os.environ.get("THEANO_GPU_IS_ALREADY_ACTIVE", "") == "Yes"
    ):
        raise RuntimeError(
            "You can't initialize the GPU in a subprocess if the parent process already did it"
        )
    if not config.cxx:
        raise RuntimeError("The new gpu-backend need a c++ compiler.")
    pygpu_version = pygpu_parse_version(pygpu.__version__)
    if pygpu_version.major != 0 or pygpu_version.minor != 7 or pygpu_version.patch < 0:
        raise ValueError(
            "Your installed version of pygpu(%s) is too old, please upgrade to 0.7.0 or later (but below 0.8.0)"
            % pygpu_version.fullversion
        )
    # This is for the C headers API, we need to match the exact version.
    gpuarray_version_major_supported = 2
    gpuarray_version_major_detected = pygpu.gpuarray.api_version()[0]
    if gpuarray_version_major_detected != gpuarray_version_major_supported:
        raise ValueError(
            "Your installed version of libgpuarray is not in sync with the current Theano"
            f" version. The installed libgpuarray version supports API version {int(gpuarray_version_major_detected)},"
            f" while current Theano supports API version {int(gpuarray_version_major_supported)}. Change the version of"
            " libgpuarray or Theano to fix this problem.",
        )
    if dev not in init_dev.devmap:
        args = dict()
        if config.gpuarray__cache_path != "":
            args["kernel_cache_path"] = config.gpuarray__cache_path
        if preallocate is None:
            preallocate = config.gpuarray__preallocate
        if preallocate < 0:
            args["max_cache_size"] = 0
        else:
            args["initial_cache_size"] = preallocate
        context = pygpu.init(
            dev,
            sched=config.gpuarray__sched,
            single_stream=config.gpuarray__single_stream,
            **args,
        )
        os.environ["THEANO_GPU_IS_ALREADY_ACTIVE"] = "Yes"
        theano_gpu_is_already_active = True
        context.dev = dev
        init_dev.devmap[dev] = context
        reg_context(name, context)

        MB = 1024 * 1024
        if dev.startswith("cuda"):
            avail = dnn.dnn_available(name)
            # If we try to enable cuDNN and there isn't enough GPU
            # memory, the resulting error message would be unclear, so
            # raise a clear error here instead of even trying.
            if avail and context.free_gmem < 75 * MB:
                raise RuntimeError(
                    f"Can not enable cuDNN as there is only {int(context.free_gmem / MB)} MB of free GPU memory."
                )
            elif avail:
                context.cudnn_handle = dnn._make_handle(context)
            elif config.dnn__enabled == "True":
                raise RuntimeError(
                    "You enabled cuDNN, but we aren't able to use it: %s"
                    % dnn.dnn_available.msg
                )
            if config.print_active_device:
                if avail:
                    print(
                        f"Using cuDNN version {int(dnn.version())} on context {name}",
                        file=sys.stderr,
                    )
                else:
                    print(
                        f"Can not use cuDNN on context {name}: {dnn.dnn_available.msg}",
                        file=sys.stderr,
                    )
        if preallocate < 0:
            print(f"Disabling allocation cache on {dev}")
        elif preallocate > 0:
            if preallocate <= 1:
                gmem = min(preallocate, 0.95) * context.total_gmem
            else:
                gmem = preallocate * MB
            if gmem > context.free_gmem:
                raise RuntimeError(
                    f"Trying to preallocate {int(gmem / MB)} MB of GPU memory while only"
                    f" {int(context.free_gmem / MB)} MB are available."
                )
            elif gmem > context.free_gmem - 50 * MB:
                warnings.warn(
                    "Preallocating too much memory can prevent cudnn and cublas from working properly"
                )

            # This will allocate and immediately free an object of size gmem
            # which will reserve that amount of memory on the GPU.
            pygpu.empty((gmem,), dtype="int8", context=context)
            if config.print_active_device:
                print(
                    f"Preallocating {int(gmem // MB)}/{int(context.total_gmem // MB)} Mb ({gmem / context.total_gmem}) on {dev}",
                    file=sys.stderr,
                )

        # Initialise the blas kernels.  We do this after the
        # preallocation to not fragment the heap accidentally.
        tmp = pygpu.empty((2, 2), dtype="float32", context=context)
        if dev.startswith("cuda"):
            # In OpenCL, BLAS isn't always available
            pygpu.blas.gemm(0, tmp, tmp, 0, tmp, overwrite_c=True)
        del tmp
    else:
        context = init_dev.devmap[dev]
    # This will map the context name to the real context object.
    if config.print_active_device:
        try:
            unique_id = "(" + context.unique_id + ")"
        except pygpu.gpuarray.UnsupportedException:
            unique_id = ""

        print(
            f"Mapped name {name} to device {dev}: {context.devname} {unique_id}",
            file=sys.stderr,
        )
    pygpu_activated = True
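A quick worked reading of the preallocation rule above, with illustrative numbers not taken from the source: on a device reporting 8192 MB of total memory, preallocate = 0.5 reserves 0.5 * 8192 MB = 4096 MB, preallocate = 512 reserves a flat 512 MB, and any negative preallocate value disables the allocation cache instead of reserving memory.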
Example #19
def local_abstractconv_cudnn_alt(node):
    if not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return

    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
            isinstance(p, tuple) for p in node.op.border_mode):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )

    elif isinstance(op, AbstractConv2d_gradWeights):
        if (border_mode == "valid" and subsample == (1, 1)
                and filter_dilation == (1, 1) and num_groups == 1):
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype,
                                context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)

            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad,
                                                                out, desc)
            if conv_mode == "conv":
                conv = conv[:, :, ::-1, ::-1]

            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None

    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)

            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype,
                                context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns,
                                                                out, desc)
        else:
            return None

    return [rval]
Example #20
def local_dnn_reduction(node):
    if not isinstance(node.op, GpuCAReduceCuda):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)

    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[
                0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return

    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return

    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return

    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    if node.op.pre_scalar_op is not None:
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif isinstance(node.op.scalar_op,
                        theano.scalar.basic.Maximum) and isinstance(
                            node.op.pre_scalar_op, theano.scalar.basic.Abs):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype,
                              False)(node.inputs[0])
        return [post(ret)]
Example #21
def test_dnn_rnn_lstm():
    if not dnn.dnn_available(test_ctx_name):
        raise SkipTest(dnn.dnn_available.msg)
    utt.seed_rng()

    # test params
    input_dim = 32
    hidden_dim = 16
    batch_size = 2
    depth = 3
    timesteps = 5

    # test code
    X = T.tensor3('X')
    Y = T.tensor3('Y')
    h0 = T.tensor3('h0')
    c0 = T.tensor3('c0')

    rnnb = dnn.RNNBlock(theano.config.floatX, hidden_dim, depth, 'lstm')
    psize = rnnb.get_param_size([batch_size, input_dim])
    params_cudnn = gpuarray_shared_constructor(
        np.zeros((psize, ), dtype=theano.config.floatX))

    model = Model()
    last_layer = WrapperLayer(X)
    last_dim = input_dim
    for i in range(depth):
        lstm = LSTM(last_dim,
                    hidden_dim,
                    last_layer,
                    s0=h0[i, :, :],
                    c0=c0[i, :, :])
        model.add_layer(lstm)
        last_layer = lstm
        last_dim = hidden_dim
        layer_params = lstm.get_params()
        dnn_params = rnnb.split_params(params_cudnn, i,
                                       [batch_size, input_dim])
        for j, p in enumerate(dnn_params):
            p[:] = layer_params[j].get_value(borrow=True,
                                             return_internal_type=True)

    def funcs(out, params):
        fn = theano.function([X, h0, c0], out, mode=mode_with_gpu)
        cost = T.mean((Y - out)**2)
        grad = T.grad(cost, [X, h0, c0] + params)
        grad_fn = theano.function([X, Y, h0, c0], grad, mode=mode_with_gpu)
        return fn, grad_fn

    ref_fn, ref_grad_fn = funcs(last_layer.output(), model.get_params())
    cudnn_fn, cudnn_grad_fn = funcs(
        rnnb.apply(params_cudnn, X, h0, c0)[0], [params_cudnn])

    x_val = np.random.random(
        (timesteps, batch_size, input_dim)).astype(theano.config.floatX)
    y_val = np.random.random(
        (timesteps, batch_size, hidden_dim)).astype(theano.config.floatX)
    h0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)
    c0_val = np.random.random(
        (depth, batch_size, hidden_dim)).astype(theano.config.floatX)

    ref_out = ref_fn(x_val, h0_val, c0_val)
    cudnn_out = cudnn_fn(x_val, h0_val, c0_val)

    utt.assert_allclose(ref_out, cudnn_out)

    ref_grads = ref_grad_fn(x_val, y_val, h0_val, c0_val)
    cudnn_grads = cudnn_grad_fn(x_val, y_val, h0_val, c0_val)

    utt.assert_allclose(ref_grads[0], cudnn_grads[0])
    utt.assert_allclose(ref_grads[1], cudnn_grads[1])
    utt.assert_allclose(ref_grads[2], cudnn_grads[2])

    ref_grads_params = ref_grads[3:]
    cudnn_grads_params = gpuarray_shared_constructor(cudnn_grads[3])

    for i in range(depth):
        cudnn_grads_layer = rnnb.split_params(cudnn_grads_params, i,
                                              [batch_size, input_dim])
        ref_grads_layer = ref_grads_params[i * len(cudnn_grads_layer):(i + 1) *
                                           len(cudnn_grads_layer)]
        for j, g in enumerate(cudnn_grads_layer):
            utt.assert_allclose(ref_grads_layer[j], g)
Example #22
from . import leaf, optimizer, initializer, logger

try:
    import theano
    from theano.gpuarray import ContextNotDefined
    from theano.gpuarray.dnn import dnn_available
    assert dnn_available(None)
    assert theano.config.dnn.enabled != "False"
except (ImportError, AssertionError, ContextNotDefined):
    logger.logger.warning("cuDNN is unavailable")