Example #1
def ptx_nanosleep(context, builder, sig, args):
    nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]),
                             "nanosleep.u32 $0;",
                             'r',
                             side_effect=True)
    ns = args[0]
    builder.call(nanosleep, [ns])
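These snippets (largely from Numba's CUDA target) all build on llvmlite's ir.InlineAsm, so the pattern can be exercised outside of Numba's lowering machinery. A minimal self-contained sketch, assuming only llvmlite (the module and function names here are invented for illustration):

from llvmlite import ir

mod = ir.Module(name="nanosleep_demo")
fnty = ir.FunctionType(ir.VoidType(), [ir.IntType(32)])
fn = ir.Function(mod, fnty, name="sleep_wrapper")
builder = ir.IRBuilder(fn.append_basic_block("entry"))

# 'r' pins the operand to a 32-bit register; side_effect=True stops the
# otherwise value-less call from being optimized away.
nanosleep = ir.InlineAsm(ir.FunctionType(ir.VoidType(), [ir.IntType(32)]),
                         "nanosleep.u32 $0;", 'r', side_effect=True)
builder.call(nanosleep, [fn.args[0]])
builder.ret_void()
print(mod)  # expect: call void asm sideeffect "nanosleep.u32 $0;", "r" (...)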
Example #2
def integer_to_float16_cast(context, builder, fromty, toty, val):
    bitwidth = fromty.bitwidth
    constraint = float16_int_constraint(bitwidth)
    signedness = 's' if fromty.signed else 'u'

    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
    asm = ir.InlineAsm(fnty, f"cvt.rn.f16.{signedness}{bitwidth} $0, $1;",
                       f"=h,{constraint}")
    return builder.call(asm, [val])
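float16_int_constraint is not part of this listing. NVPTX inline assembly selects register classes by letter (commonly 'c' = 8-bit, 'h' = 16-bit, 'r' = 32-bit, 'l' = 64-bit), so a plausible sketch of the helper, an assumption rather than Numba's actual source, is:

def float16_int_constraint(bitwidth):
    # Hypothetical helper: map an integer bitwidth to the PTX
    # register-constraint letter used in the asm constraint string.
    constraints = {8: 'c', 16: 'h', 32: 'r', 64: 'l'}
    try:
        return constraints[bitwidth]
    except KeyError:
        raise ValueError(f"unsupported integer bitwidth: {bitwidth}")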
Example #3
def float_to_float16_cast(context, builder, fromty, toty, val):
    if fromty.bitwidth == toty.bitwidth:
        return val

    ty, constraint = float16_float_ty_constraint(fromty.bitwidth)

    fnty = ir.FunctionType(ir.IntType(16), [context.get_value_type(fromty)])
    asm = ir.InlineAsm(fnty, f"cvt.rn.f16.{ty} $0, $1;", f"=h,{constraint}")
    return builder.call(asm, [val])
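Similarly, float16_float_ty_constraint presumably yields both the PTX type suffix and the constraint letter ('f' for .f32, 'd' for .f64). A hedged sketch, not the verbatim implementation:

def float16_float_ty_constraint(bitwidth):
    # Hypothetical helper: (PTX type suffix, constraint letter) per bitwidth.
    mapping = {32: ('f32', 'f'), 64: ('f64', 'd')}
    try:
        return mapping[bitwidth]
    except KeyError:
        raise ValueError(f"unsupported float bitwidth: {bitwidth}")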
Example #4
def float16_to_integer_cast(context, builder, fromty, toty, val):
    bitwidth = toty.bitwidth
    constraint = float16_int_constraint(bitwidth)
    signedness = 's' if toty.signed else 'u'

    fnty = ir.FunctionType(context.get_value_type(toty), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty, f"cvt.rni.{signedness}{bitwidth}.f16 $0, $1;",
                       f"={constraint},h")
    return builder.call(asm, [val])
Example #5
    def ptx_fp16_comparison(context, builder, sig, args):
        fnty = ir.FunctionType(ir.IntType(16),
                               [ir.IntType(16), ir.IntType(16)])
        asm = ir.InlineAsm(fnty, _fp16_cmp.format(op=op), '=h,h,h')
        result = builder.call(asm, args)

        zero = context.get_constant(types.int16, 0)
        int_result = builder.bitcast(result, ir.IntType(16))
        return builder.icmp_unsigned("!=", int_result, zero)
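Here _fp16_cmp and op come from an enclosing factory. PTX comparisons produce a predicate register rather than a value, so the template presumably pairs setp with selp to materialize a 0/1 result in the 16-bit output. A plausible shape, an assumption rather than the verbatim template (the doubled braces survive the .format(op=op) call):

_fp16_cmp = """{{
    .reg .pred __cmp_res;
    setp.{op}.f16 __cmp_res, $1, $2;
    selp.u16 $0, 1, 0, __cmp_res;
}}"""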
Example #6
    def test_inline_assembly(self):
        mod = self.module()
        foo = ir.Function(mod, ir.FunctionType(ir.VoidType(), []), 'foo')
        builder = ir.IRBuilder(foo.append_basic_block(''))
        asmty = ir.FunctionType(ir.IntType(32), [ir.IntType(32)])
        asm = ir.InlineAsm(asmty, "mov $1, $2", "=r,r", side_effect=True)
        builder.call(asm, [ir.Constant(ir.IntType(32), 123)])
        builder.ret_void()
        pat = 'call i32 asm sideeffect "mov $1, $2", "=r,r" ( i32 123 )'
        self.assertInText(pat, str(mod))
        self.assert_valid_ir(mod)
Example #7
    def mark_location(self, builder, line):
        # Avoid emitting a duplicate marker for the same line
        if self._last_lineno == line:
            return
        self._last_lineno = line
        # Call a no-op inline asm comment to mark the line location
        asmty = ir.FunctionType(ir.VoidType(), [])
        asm = ir.InlineAsm(asmty, "// dbg {}".format(line), "",
                           side_effect=True)
        call = builder.call(asm, [])
        md = self._di_location(line)
        call.set_metadata('numba.dbg', md)
Example #8
def ptx_fp16_habs(context, builder, sig, args):
    if cuda.runtime.get_version() < (10, 2):
        # CUDA < 10.2 does not support abs.f16. For these versions, we mask
        # off the sign bit to compute abs instead. We determine whether or
        # not to do this based on the runtime version so that our behaviour
        # is consistent with the version of NVVM we're using to go from
        # NVVM IR -> PTX.
        inst = 'and.b16 $0, $1, 0x7FFF;'
    else:
        inst = 'abs.f16 $0, $1;'

    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty, inst, '=h,h')
    return builder.call(asm, args)
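The masking trick works because IEEE 754 binary16 keeps the sign in the top bit. A quick standalone check, using NumPy purely for illustration:

import numpy as np

x = np.float16(-1.5)
bits = x.view(np.uint16)              # 0xBE00
absbits = np.uint16(bits & 0x7FFF)    # clear the sign bit -> 0x3E00
print(absbits.view(np.float16))       # 1.5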
Example #9
    def test_inline_rsqrt(self):
        mod = ir.Module(__name__)
        fnty = ir.FunctionType(ir.VoidType(), [ir.PointerType(ir.FloatType())])
        fn = ir.Function(mod, fnty, 'cu_rsqrt')
        bldr = ir.IRBuilder(fn.append_basic_block('entry'))

        rsqrt_approx_fnty = ir.FunctionType(ir.FloatType(), [ir.FloatType()])
        inlineasm = ir.InlineAsm(rsqrt_approx_fnty,
                                 'rsqrt.approx.f32 $0, $1;',
                                 '=f,f',
                                 side_effect=True)
        val = bldr.load(fn.args[0])
        res = bldr.call(inlineasm, [val])

        bldr.store(res, fn.args[0])
        bldr.ret_void()

        # generate ptx
        nvvm.fix_data_layout(mod)
        nvvm.set_cuda_kernel(fn)
        nvvmir = str(mod)
        ptx = nvvm.llvm_to_ptx(nvvmir)
        self.assertTrue('rsqrt.approx.f32' in str(ptx))
Example #10
def _generic_array(context,
                   builder,
                   shape,
                   dtype,
                   symbol_name,
                   addrspace,
                   can_dynsized=False):
    elemcount = reduce(operator.mul, shape, 1)

    # Check for valid shape for this type of allocation.
    # Only 1d arrays can be dynamic.
    dynamic_smem = elemcount <= 0 and can_dynsized and len(shape) == 1
    if elemcount <= 0 and not dynamic_smem:
        raise ValueError("array length <= 0")

    # Check that we support the requested dtype
    data_model = context.data_model_manager[dtype]
    other_supported_type = (isinstance(dtype, (types.Record, types.Boolean))
                            or isinstance(data_model, models.StructModel))
    if dtype not in types.number_domain and not other_supported_type:
        raise TypeError("unsupported type: %s" % dtype)

    lldtype = context.get_data_type(dtype)
    laryty = ir.ArrayType(lldtype, elemcount)

    if addrspace == nvvm.ADDRSPACE_LOCAL:
        # Special case local address space allocation to use alloca
        # NVVM is smart enough to only use local memory if no register is
        # available
        dataptr = cgutils.alloca_once(builder, laryty, name=symbol_name)
    else:
        lmod = builder.module

        # Create global variable in the requested address space
        gvmem = cgutils.add_global_variable(lmod, laryty, symbol_name,
                                            addrspace)
        # Specify the alignment to avoid a misalignment bug
        align = context.get_abi_sizeof(lldtype)
        # Alignment is required to be a power of 2 for shared memory. If it is
        # not a power of 2 (e.g. for a Record array) then round up accordingly.
        gvmem.align = 1 << (align - 1).bit_length()

        if dynamic_smem:
            gvmem.linkage = 'external'
        else:
            ## The following line is left commented out to work around an
            ## NVVM bug which generates an invalid symbol name when the
            ## linkage is internal, in some situations.
            ## See _get_unique_smem_id()
            # gvmem.linkage = lc.LINKAGE_INTERNAL

            gvmem.initializer = ir.Constant(laryty, ir.Undefined)

        # Convert to generic address-space
        conv = nvvmutils.insert_addrspace_conv(lmod, ir.IntType(8), addrspace)
        addrspaceptr = gvmem.bitcast(ir.PointerType(ir.IntType(8), addrspace))
        dataptr = builder.call(conv, [addrspaceptr])

    targetdata = _get_target_data(context)
    lldtype = context.get_data_type(dtype)
    itemsize = lldtype.get_abi_size(targetdata)

    # Compute strides
    laststride = itemsize
    rstrides = []
    for i, lastsize in enumerate(reversed(shape)):
        rstrides.append(laststride)
        laststride *= lastsize
    strides = [s for s in reversed(rstrides)]
    kstrides = [context.get_constant(types.intp, s) for s in strides]

    # Compute shape
    if dynamic_smem:
        # Compute the shape based on the dynamic shared memory configuration.
        # Unfortunately NVVM does not provide an intrinsic for the
        # %dynamic_smem_size register, so we must read it using inline
        # assembly.
        get_dynshared_size = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                                          "mov.u32 $0, %dynamic_smem_size;",
                                          '=r',
                                          side_effect=True)
        dynsmem_size = builder.zext(builder.call(get_dynshared_size, []),
                                    ir.IntType(64))
        # Only 1-D dynamic shared memory is supported so the following is a
        # sufficient construction of the shape
        kitemsize = context.get_constant(types.intp, itemsize)
        kshape = [builder.udiv(dynsmem_size, kitemsize)]
    else:
        kshape = [context.get_constant(types.intp, s) for s in shape]

    # Create array object
    ndim = len(shape)
    aryty = types.Array(dtype=dtype, ndim=ndim, layout='C')
    ary = context.make_array(aryty)(context, builder)

    context.populate_array(ary,
                           data=builder.bitcast(dataptr, ary.data.type),
                           shape=kshape,
                           strides=kstrides,
                           itemsize=context.get_constant(types.intp, itemsize),
                           meminfo=None)
    return ary._getvalue()
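The stride loop above walks the shape from the innermost dimension outwards, which is exactly the C-contiguous layout. Extracted into a standalone function (the name is invented here), it agrees with NumPy:

def c_strides(shape, itemsize):
    # Innermost dimension is contiguous; each outer stride is the
    # product of all inner extents times the itemsize.
    laststride = itemsize
    rstrides = []
    for lastsize in reversed(shape):
        rstrides.append(laststride)
        laststride *= lastsize
    return tuple(reversed(rstrides))

print(c_strides((3, 4), 2))  # (8, 2), matching numpy.zeros((3, 4), 'f2').strides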
Example #11
def ptx_hfma(context, builder, sig, args):
    argtys = [ir.IntType(16), ir.IntType(16), ir.IntType(16)]
    fnty = ir.FunctionType(ir.IntType(16), argtys)
    asm = ir.InlineAsm(fnty, "fma.rn.f16 $0,$1,$2,$3;", "=h,h,h,h")
    return builder.call(asm, args)
Example #12
def ptx_fp16_hneg(context, builder, sig, args):
    fnty = ir.FunctionType(ir.IntType(16), [ir.IntType(16)])
    asm = ir.InlineAsm(fnty, 'neg.f16 $0, $1;', '=h,h')
    return builder.call(asm, args)
Example #13
    def ptx_fp16_binary(context, builder, sig, args):
        fnty = ir.FunctionType(ir.IntType(16),
                               [ir.IntType(16), ir.IntType(16)])
        asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h')
        return builder.call(asm, args)
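As in Example #5, op is a closure variable; a factory along these lines (hypothetical wiring, not Numba's exact source) would yield one lowering per PTX opcode:

def make_fp16_binary(op):
    def ptx_fp16_binary(context, builder, sig, args):
        fnty = ir.FunctionType(ir.IntType(16),
                               [ir.IntType(16), ir.IntType(16)])
        asm = ir.InlineAsm(fnty, f'{op}.f16 $0,$1,$2;', '=h,h,h')
        return builder.call(asm, args)
    return ptx_fp16_binary

ptx_fp16_add = make_fp16_binary('add')  # likewise 'sub', 'mul', ...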
Example #14
def ptx_lanemask_lt(context, builder, sig, args):
    activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                              "mov.u32 $0, %lanemask_lt;",
                              '=r',
                              side_effect=True)
    return builder.call(activemask, [])
Example #15
def ptx_activemask(context, builder, sig, args):
    activemask = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                              "activemask.b32 $0;",
                              '=r',
                              side_effect=True)
    return builder.call(activemask, [])
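Examples #14 and #15 share one shape: an asm with no inputs and a single 32-bit output ('=r'), marked side_effect=True so that repeated reads are not folded into one. For the special-register case, a generic helper (hypothetical, for illustration) makes the pattern explicit:

def read_u32_sreg(builder, sreg):
    # Read a 32-bit PTX special register, e.g. 'lanemask_lt' or 'laneid'.
    asm = ir.InlineAsm(ir.FunctionType(ir.IntType(32), []),
                       f"mov.u32 $0, %{sreg};", '=r', side_effect=True)
    return builder.call(asm, [])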