Example #1
def local_cudnn_maxandargmax(node):
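    # Rewrite a GpuMaxAndArgmax node as a single cuDNN "maximum" reduction
    # that returns both the maxima and their indices.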
    if not isinstance(node.op, GpuMaxAndArgmax):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    if node.inputs[0].dtype != node.outputs[0].dtype:
        return

    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if node.op.axis is not None and tuple(sorted(
            node.op.axis)) != node.op.axis:
        return

    max, arg = GpuDnnReduction("maximum", node.op.axis, node.outputs[0].dtype,
                               node.outputs[0].dtype, True)(node.inputs[0])

    # cudnn can only return int32 indices; cast them back to int64 to match
    # the expected argmax output dtype.
    return (
        max,
        as_gpuarray_variable(arg.astype("int64"),
                             node.outputs[1].type.context_name),
    )
Example #2
def local_dnn_argmax(op, ctx_name, inputs, outputs):
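    # Replace an argmax with a cuDNN "maximum" reduction; only the index
    # output of the reduction is kept.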
    if not dnn_available(ctx_name):
        return

    if version(raises=False) < 6000:
        return

    if inputs[0].ndim > 8:
        return

    if inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    # order of the axes influences the output indices
    if op.axis is not None and tuple(sorted(op.axis)) != op.axis:
        return

    max, arg = GpuDnnReduction("maximum", op.axis, inputs[0].dtype,
                               inputs[0].dtype, True)(*inputs)

    return [as_gpuarray_variable(arg.astype("int64"), ctx_name)]
Example #3
def local_abstractconv_cudnn_alt(node):
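    # Alternative cuDNN lowering: implement an abstract convolution, or one of
    # its gradients, through a different cuDNN kernel than the default mapping.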
    if not isinstance(node.op, (AbstractConv2d, AbstractConv2d_gradWeights,
                                AbstractConv2d_gradInputs)):
        return

    if version(raises=False) < 6000 and node.op.filter_dilation != (1, 1):
        return None
    if node.op.unshared:
        return None
    if isinstance(node.op.border_mode, tuple) and any(
            isinstance(p, tuple) for p in node.op.border_mode):
        # Asymmetric padding not yet supported
        return None
    inp1 = node.inputs[0]
    inp2 = node.inputs[1]

    if not dnn_available(inp1.type.context_name):
        return

    op = node.op
    border_mode = node.op.border_mode
    subsample = node.op.subsample
    filter_dilation = node.op.filter_dilation
    num_groups = node.op.num_groups
    precision, _ = get_precision(None, [inp1, inp2])

    if node.op.filter_flip:
        conv_mode = "conv"
    else:
        conv_mode = "cross"

    if isinstance(op, AbstractConv2d):
        if border_mode == "half" or subsample != (1, 1) or num_groups != 1:
            return None
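        # The direction hint asks dnn_conv to implement this forward
        # convolution with the corresponding cuDNN gradient kernel.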
        if border_mode == "full":
            direction_hint = "bprop inputs"
        elif border_mode == "valid" and filter_dilation == (1, 1):
            direction_hint = "bprop weights"
        else:
            return None

        rval = dnn_conv(
            inp1,
            inp2,
            border_mode=border_mode,
            subsample=subsample,
            dilation=filter_dilation,
            direction_hint=direction_hint,
            conv_mode=conv_mode,
            num_groups=num_groups,
        )

    elif isinstance(op, AbstractConv2d_gradWeights):
        if (border_mode == "valid" and subsample == (1, 1)
                and filter_dilation == (1, 1) and num_groups == 1):
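            # Express the weight gradient as a forward "cross" convolution of
            # the batch/channel-swapped image and top gradient.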
            img = gpu_contiguous(inp1)
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(img, topgrad)
            img = gpu_contiguous(img.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(topgrad.dimshuffle(1, 0, 2, 3))
            ishape = [shape_i_op(i)(img) for i in range(img.ndim)]
            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            out_shp = get_conv_output_shape(
                ishape,
                tshape,
                border_mode=border_mode,
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            out_shp = assert_conv_shape(out_shp)
            out = GpuAllocEmpty(dtype=img.dtype,
                                context_name=ctx_name)(*out_shp)
            desc = GpuDnnConvDesc(
                border_mode=border_mode,
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode="cross",
                precision=precision,
            )(out.shape)

            conv = GpuDnnConv(algo=None, num_groups=num_groups)(img, topgrad,
                                                                out, desc)
            if conv_mode == "conv":
                conv = conv[:, :, ::-1, ::-1]

            rval = as_gpuarray_variable(conv.dimshuffle(1, 0, 2, 3), ctx_name)
        else:
            return None

    elif isinstance(op, AbstractConv2d_gradInputs):
        if border_mode == "valid" and subsample == (1, 1) and num_groups == 1:
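            # Express the input gradient as a forward convolution with "full"
            # padding, the opposite conv_mode and channel-swapped kernels.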
            kerns = gpu_contiguous(inp1.dimshuffle(1, 0, 2, 3))
            topgrad = gpu_contiguous(inp2)
            ctx_name = infer_context_name(kerns, topgrad)
            conv_mode = "cross" if conv_mode == "conv" else "conv"
            desc = GpuDnnConvDesc(
                border_mode="full",
                subsample=subsample,
                dilation=filter_dilation,
                conv_mode=conv_mode,
                precision=precision,
            )(kerns.shape)

            tshape = [shape_i_op(i)(topgrad) for i in range(topgrad.ndim)]
            kshape = [shape_i_op(i)(kerns) for i in range(kerns.ndim)]
            shape = get_conv_output_shape(
                tshape,
                kshape,
                border_mode="full",
                subsample=subsample,
                filter_dilation=filter_dilation,
            )

            shape = assert_conv_shape(shape)
            out = GpuAllocEmpty(dtype=topgrad.dtype,
                                context_name=ctx_name)(*shape)
            rval = GpuDnnConv(algo=None, num_groups=num_groups)(topgrad, kerns,
                                                                out, desc)
        else:
            return None

    return [rval]
Example #4
def local_dnn_reduction(node):
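    # Replace a GpuCAReduceCuda node with a GpuDnnReduction when the axes,
    # dtypes and scalar op are supported by cuDNN.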
    if not isinstance(node.op, GpuCAReduceCuda):
        return

    if not dnn_available(node.inputs[0].type.context_name):
        return

    if version(raises=False) < 6000:
        return

    if node.inputs[0].ndim > 8:
        return

    acc_dtype = node.op._acc_dtype(node.inputs[0].dtype)

    if node.inputs[0].dtype != node.outputs[0].dtype:
        # We can mix float16 and float32, but not float64.
        if node.inputs[0].dtype == "float64" or node.outputs[
                0].dtype == "float64":
            return
        if acc_dtype != "float32":
            return

    if node.inputs[0].dtype not in ["float16", "float32", "float64"]:
        return

    if node.inputs[0].dtype == "float64" and acc_dtype != "float64":
        return

    if node.inputs[0].dtype == "float32" and acc_dtype != "float32":
        return

    if node.inputs[0].dtype == "float16" and acc_dtype == "float64":
        return

    def _identity(a):
        return a

    def _square(a):
        return GpuElemwise(theano.scalar.basic.sqr)(a)

    scal = node.op.scalar_op.name
    post = _identity

    if node.op.pre_scalar_op is not None:
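        # Map supported pre_scalar_op/scalar_op pairs onto dedicated cuDNN
        # reduction modes (sqr+add -> norm2, abs+add -> norm1, abs+max -> absmax).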
        if isinstance(node.op.scalar_op, theano.scalar.basic.Add):
            if isinstance(node.op.pre_scalar_op, theano.scalar.basic.Sqr):
                scal = "norm2"
                post = _square
            elif isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs):
                scal = "norm1"
            else:
                return
        elif (isinstance(node.op.scalar_op, theano.scalar.basic.Maximum)
              and isinstance(node.op.pre_scalar_op, theano.scalar.basic.Abs)):
            scal = "absmax"
        else:
            return

    if not cudnn.cudnnReduceTensorOp_t.has_alias(scal):
        return

    with inherit_stack_trace(node.outputs):
        ret = GpuDnnReduction(scal, node.op.axis, acc_dtype, node.op.dtype,
                              False)(node.inputs[0])
        return [post(ret)]
Example #5
    # We use FWD 2D to check it.
    # Based on documentation, algo small (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)
    # should support all configurations for v5.1, v6 and v7.
    inputs = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    filters = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    conv = dnn_conv(inputs, filters, precision=precision, algo="small")
    f = theano.function([], conv, mode=mode_with_gpu)
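    # Run the tiny convolution once; a CUDNN_STATUS_ARCH_MISMATCH error means
    # this dtype/precision combination is not supported on the current GPU.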
    try:
        f()
    except RuntimeError as e:
        assert "CUDNN_STATUS_ARCH_MISMATCH" in str(e)
        return False
    return True


cudnn = cudnn_defs.get_definitions(version(raises=False))


class ConvCase:
    """
    Helper class to describe a special test case quickly.
    This handles only 2D and 3D cases.
    """

    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(
        self,
        type,
        inputs_shape,
        filters_shape,
Example #6
    # We use FWD 2D to check it.
    # Based on documentation, algo small (CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM)
    # should support all configurations for v5.1, v6 and v7.
    inputs = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    filters = theano.shared(np.zeros((1, 1, 2, 2), dtype=dtype))
    conv = dnn_conv(inputs, filters, precision=precision, algo='small')
    f = theano.function([], conv, mode=mode_with_gpu)
    try:
        f()
    except RuntimeError as e:
        assert 'CUDNN_STATUS_ARCH_MISMATCH' in str(e)
        return False
    return True


cudnn = cudnn_defs.get_definitions(version(raises=False))


class ConvCase:
    """
    Helper class to describe a special test case quickly.
    This handles only 2D and 3D cases.
    """

    FWD, GRADINPUT, GRADWEIGHT = 0, 1, 2

    def __init__(self, type,
                 inputs_shape, filters_shape,
                 algo=None, dtype=None, precision=None,
                 subsample=None, dilation=None, border_mode='valid',
                 conv_mode='conv', alpha=1, beta=0,