Example #1
 def __init__(self, kernel_name, func_name=None):
   self.func_name = func_name or kernel_name
   self.name = kernel_name + ".cu"
   with open(path.join(kernel_dir, self.name), 'r') as cu_f:
     self.kernel_source = cu_f.read().encode()
   self.prog = Program(self.kernel_source, self.name.encode()) 
   self.cache = {}
Example #2
def compile_with_nvrtc(cuda_text):
    c = Program(cuda_text)
    device = deviceinfo()
    cp = str(device.cc_major.value * 10 + device.cc_minor.value)
    ptx = c.compile(['-arch=compute_' + cp])
    with open(PTX_PATH, 'w+') as f:
        f.write(ptx)
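
Since compile_with_nvrtc above only writes the generated PTX to disk, here is a short follow-on sketch (an assumption, not part of the original snippet) that loads that PTX with cupy and returns a launchable kernel handle, mirroring the Module/get_function pattern used throughout the examples below.

from cupy.cuda import function

def load_compiled_kernel(func_name, ptx_path=PTX_PATH):
    # Read back the PTX that compile_with_nvrtc wrote to PTX_PATH.
    with open(ptx_path, 'rb') as f:
        ptx = f.read()
    m = function.Module()
    m.load(ptx)  # Module.load accepts the raw PTX bytes
    return m.get_function(func_name)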
Example #3
    def __call__(self, input):
        if not self.jit or not isinstance(input, torch.cuda.FloatTensor):
            norm = input.norm(2, input.dim() - 1)
            return torch.cat([norm, norm.new(norm.size()).zero_()], input.dim() - 1)

        out = input.new(input.size())
        input = input.contiguous()

        if not iscomplex(input):
            raise TypeError('The input and outputs should be complex')

        if (self.modulus_cache[input.get_device()] is None):
            kernel = """
            extern "C"
            __global__ void abs_complex_value(const float * x, float2 * z, int n)
            {
                int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= n)
                return;
            z[i] = make_float2(normf(2, x + 2*i), 0);

            }
            """
            print('modulus.cu')
            prog = Program(kernel, 'modulus.cu')
            ptx = prog.compile([('-arch='+get_compute_arch(input))])
            module = Module()
            module.load(ptx.encode())
            self.modulus_cache[input.get_device()] = module
        fabs = self.modulus_cache[input.get_device()].get_function('abs_complex_value')
        fabs(grid=(self.GET_BLOCKS(int(out.nelement())//2), 1, 1),
             block=(self.CUDA_NUM_THREADS, 1, 1),
             args=[input.data_ptr(), out.data_ptr(), out.numel() // 2],
             stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return out
Example #4
    def __call__(self, points, indices):
        size = points.size()
        if size in self.cached_functions:
            func = self.cached_functions[size]
        else:
            kernel = open('utils/cuda/distance.cu', 'r').read()
            kernel = Template(kernel).substitute(BATCH_SIZE=size[0], NUM_ROWS=size[1])

            program = Program(kernel, 'distance.cu')
            ptx = program.compile()

            m = function.Module()
            m.load(bytes(ptx.encode()))

            func = m.get_function('distance')
            self.cached_functions[size] = func

        indices = indices.contiguous()

        col_ind = indices.new(indices.size(1))
        col_ptr = indices.new(size[0], size[1] + 1)
        col_ptr.fill_(indices.size(1))
        col_ptr.fill_(0)

        grid = ((indices.size(1) + 1024 - 1) // 1024, 1, 1)
        block = (1024, 1, 1)
        func(grid=grid, block=block,
             args=[indices.data_ptr(), col_ind.data_ptr(),
                   col_ptr.data_ptr(), indices.size(1)],
             stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return col_ind, col_ptr
Example #5
    def __call__(self, input):
        if not self.jit or not isinstance(input, torch.cuda.FloatTensor):
            norm = input.norm(2, input.dim() - 1)
            return torch.cat([norm, norm.new(norm.size()).zero_()], input.dim() - 1)

        out = input.new(input.size())
        input = input.contiguous()

        if not iscomplex(input):
            raise TypeError('The input and outputs should be complex')

        if (self.modulus_cache[input.get_device()] is None):
            kernel = b"""
            extern "C"
            __global__ void abs_complex_value(const float * x, float2 * z, int n)
            {
                int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= n)
                return;
            z[i] = make_float2(normf(2, x + 2*i), 0);

            }
            """
            print('modulus.cu')
            prog = Program(kernel, b'modulus.cu')
            ptx = prog.compile(['-arch='+get_compute_arch(input)])
            module = Module()
            module.load(bytes(ptx.encode()))
            self.modulus_cache[input.get_device()] = module
        fabs = self.modulus_cache[input.get_device()].get_function('abs_complex_value')
        fabs(grid=(self.GET_BLOCKS(int(out.nelement())//2), 1, 1),
             block=(self.CUDA_NUM_THREADS, 1, 1),
             args=[input.data_ptr(), out.data_ptr(), out.numel() // 2],
             stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return out
Example #6
    def compile(self):
        if self.ptx is None:
            from pynvrtc.compiler import Program
            program = Program(kernel, 'recurrent_forget_mult.cu')
            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            from cupy.cuda import function
            m = function.Module()
            m.load(bytes(self.ptx.encode()))

            # Forward ForgetMult
            self.forget_mult = m.get_function('recurrent_forget_mult')
            self.forget_mult_bwd = m.get_function('recurrent_forget_mult_bwd')

            # Backward ForgetMult
            self.bwd_forget_mult = m.get_function('recurrent_bwd_forget_mult')
            self.bwd_forget_mult_bwd = m.get_function(
                'recurrent_bwd_forget_mult_bwd')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
                self.forget_mult, self.forget_mult_bwd, self.bwd_forget_mult,
                self.bwd_forget_mult_bwd, self.stream)

        self.forget_mult, self.forget_mult_bwd, self.bwd_forget_mult, self.bwd_forget_mult_bwd, self.stream = GPUForgetMult.configured_gpus[
            torch.cuda.current_device()]
Example #7
File: cuda.py  Project: zyc2sjtu/s2cnn
def compile_kernel(kernel, filename, functioname):
    program = Program(bytes(kernel, 'ascii'), filename)
    ptx = program.compile()

    m = function.Module()
    m.load(bytes(ptx.encode()))

    f = m.get_function(functioname)
    return f
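
For context, a minimal usage sketch of compile_kernel above; the saxpy kernel source, the run_saxpy helper, and the launch geometry are illustrative assumptions and are not taken from the s2cnn project.

import numpy as np
import torch
from collections import namedtuple

SAXPY_SRC = '''
extern "C" __global__
void saxpy(float a, const float *x, const float *y, float *out, int n)
{
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n)
        out[i] = a * x[i] + y[i];
}
'''

def run_saxpy(a, x, y):
    # x, y: contiguous torch.cuda.FloatTensor of the same length
    n = x.numel()
    out = x.new(x.size())
    saxpy = compile_kernel(SAXPY_SRC, 'saxpy.cu', 'saxpy')
    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    saxpy(grid=((n + 255) // 256, 1, 1),
          block=(256, 1, 1),
          args=[np.float32(a), x.data_ptr(), y.data_ptr(), out.data_ptr(), np.int32(n)],
          stream=s)
    return out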
Example #8
 def __init__(self, kernel_name, device_id, func_name=None):
   self.func_name = func_name or kernel_name
   self.name = kernel_name + ".cu"
   with open(path.join(kernel_dir, self.name), 'r') as cu_f:
     self.kernel_source = cu_f.read().encode()
   self.prog = Program(self.kernel_source, self.name.encode())
   ptx = self.prog.compile([self.get_compute_arch_arg(device_id)])
   self.module = Module()
   self.module.load(ptx.encode())
Example #9
def _initial_cupy(name="indices_slice"):
    program = Program(kernel.encode(), (name + '.cu').encode())
    ptx = program.compile()

    m = function.Module()
    m.load(bytes(ptx.encode()))

    kernel_function = m.get_function(name)

    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)
    return kernel_function, s
Example #10
    def backward(ctx, grad_h):
        ###################
        if ctx.ptx is None:
            program = Program(kernel.encode(),
                              'recurrent_forget_mult.cu'.encode())

            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(ctx.ptx.encode()))

            ctx.forget_mult = m.get_function('recurrent_forget_mult')
            ctx.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            ctx.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
                ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream)

        ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream = GPUForgetMult.configured_gpus[
            torch.cuda.current_device()]
        #################
        f, x, hidden_init = ctx.saved_tensors
        h = ctx.result
        ###
        seq_size, batch_size, hidden_size = f.size()
        # Zeroing is not necessary as these will be overwritten
        grad_f = f.new(*f.size())
        grad_x = f.new(*f.size())
        grad_h_init = f.new(batch_size, hidden_size)
        ###
        grid_hidden_size = min(hidden_size, 512)
        grid = (math.ceil(hidden_size / grid_hidden_size), batch_size)
        ctx.bwd_forget_mult(grid=grid,
                            block=(grid_hidden_size, 1),
                            args=[
                                h.data_ptr(),
                                f.data_ptr(),
                                x.data_ptr(),
                                grad_h.data_ptr(),
                                grad_f.data_ptr(),
                                grad_x.data_ptr(),
                                grad_h_init.data_ptr(), seq_size, batch_size,
                                hidden_size
                            ],
                            stream=ctx.stream)
        ###
        if hidden_init is not None:
            return grad_f, grad_x, grad_h_init
        return grad_f, grad_x
Example #11
    def compile():
        program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
        ptx = program.compile()

        m = function.Module()
        m.load(bytes(ptx.encode()))

        GPUForgetMult.forget_mult = m.get_function('recurrent_forget_mult')
        GPUForgetMult.bwd_forget_mult = m.get_function(
            'bwd_recurrent_forget_mult')

        Stream = namedtuple('Stream', ['ptr'])
        GPUForgetMult.stream = Stream(
            ptr=torch.cuda.current_stream().cuda_stream)
Example #12
    def compile(self):
        # Create program
        program = Program(self.kernel, self.title)

        # Compile program
        arch = "-arch={0}".format(cupyKernel.get_compute_arch())
        ptx = program.compile([arch])

        # Load Program
        m = function.Module()
        m.load(bytes(ptx.encode()))

        # Get Function Pointer
        self.func = m.get_function(self.func_name)
        self.compiled = True
Example #13
def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e):
    # program = Program(src.encode('utf-8'), 'best_local_affine_kernel.cu'.encode('utf-8'))
    # ptx = program.compile(['-I/usr/local/cuda/include'.encode('utf-8')])
    program = Program(src, 'best_local_affine_kernel.cu')
    ptx = program.compile(['-I/home/jsy/software/cuda/cuda-10.1/include'])
    # ptx = program.compile(['-I/usr/local/cuda/include'])
    m = function.Module()
    m.load(bytes(ptx.encode()))

    _reconstruction_best_kernel = m.get_function('reconstruction_best_kernel')
    _bilateral_smooth_kernel = m.get_function('bilateral_smooth_kernel')
    _best_local_affine_kernel = m.get_function('best_local_affine_kernel')
    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    filter_radius = f_r
    sigma1 = filter_radius / 3
    sigma2 = f_e
    radius = (patch - 1) / 2

    filtered_best_output = torch.zeros(np.shape(input_cpu)).cuda()
    affine_model = torch.zeros((h * w, 12)).cuda()
    filtered_affine_model = torch.zeros((h * w, 12)).cuda()

    input_ = torch.from_numpy(input_cpu).cuda()
    output_ = torch.from_numpy(output_cpu).cuda()
    _best_local_affine_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[output_.data_ptr(), input_.data_ptr(), affine_model.data_ptr(),
              np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius)], stream=s
    )

    _bilateral_smooth_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[affine_model.data_ptr(), filtered_affine_model.data_ptr(), input_.data_ptr(), np.int32(h), np.int32(w),
              np.int32(f_r), np.float32(sigma1), np.float32(sigma2)], stream=s
    )

    _reconstruction_best_kernel(
        grid=(int((h * w) / 256 + 1), 1),
        block=(256, 1, 1),
        args=[input_.data_ptr(), filtered_affine_model.data_ptr(), filtered_best_output.data_ptr(),
              np.int32(h), np.int32(w)], stream=s
    )
    numpy_filtered_best_output = filtered_best_output.cpu().numpy()
    return numpy_filtered_best_output
Example #14
class SingleDeviceKernel():
  def __init__(self, kernel_name, device_id, func_name=None):
    self.func_name = func_name or kernel_name
    self.name = kernel_name + ".cu"
    with open(path.join(kernel_dir, self.name), 'r') as cu_f:
      self.kernel_source = cu_f.read().encode()
    self.prog = Program(self.kernel_source, self.name.encode())
    ptx = self.prog.compile([self.get_compute_arch_arg(device_id)])
    self.module = Module()
    self.module.load(ptx.encode())

  def prep_args(self, kwargs):
    args = []
    for k, v in kwargs.items():
      try:
        args.append(v.data_ptr())
      except:
        args.append(v)
    return args

  def linear_launch(self, num_threads, **kwargs):
    kernel_func = self.module.get_function(self.func_name)
    kernel_func.linear_launch(
      num_threads,
      args = self.prep_args(kwargs),
      stream=Stream(
        ptr = torch.cuda.current_stream().cuda_stream
      )
    )   
Example #15
def get_kernel_func(kname, ksrc, dtype):
    if kname + dtype not in modules:
        ksrc = ksrc.replace('DTYPE', dtype)
        prog = Program(ksrc, kname + dtype + '.cu')
        ptx = prog.compile()
        log = prog._interface.nvrtcGetProgramLog(prog._program)
        if len(log.strip()) > 0: print(log)
        module = cupy.cuda.function.Module()
        module.load(bytes(ptx.encode()))
        modules[kname + dtype] = module
    else:
        module = modules[kname + dtype]

    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    return module.get_function(kname), s
Example #16
    def forward(ctx, f, x, hidden_init=None):
        ctx.ptx = GPUForgetMult.ptx
        ctx.configured_gpus = GPUForgetMult.configured_gpus
        # the self.compile function
        ##################
        if ctx.ptx is None:
            program = Program(kernel.encode(),
                              'recurrent_forget_mult.cu'.encode())
            GPUForgetMult.ptx = program.compile()
            ctx.ptx = GPUForgetMult.ptx

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(ctx.ptx.encode()))

            ctx.forget_mult = m.get_function('recurrent_forget_mult')
            ctx.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            ctx.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)
            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
                ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream)

        ctx.forget_mult, ctx.bwd_forget_mult, ctx.stream = GPUForgetMult.configured_gpus[
            torch.cuda.current_device()]
        ###################
        seq_size, batch_size, hidden_size = f.size()
        result = f.new(seq_size + 1, batch_size, hidden_size)
        # We only zero the result array (result[0]) if we don't set a hidden initial state
        # All other values (result[1:]) are overwritten by default
        if hidden_init is not None: result[0, :, :] = hidden_init
        else: result = result.zero_()
        ###
        grid_hidden_size = min(hidden_size, 512)
        grid = (math.ceil(hidden_size / grid_hidden_size), batch_size)
        ctx.forget_mult(grid=grid,
                        block=(grid_hidden_size, 1),
                        args=[
                            result.data_ptr(),
                            f.data_ptr(),
                            x.data_ptr(), seq_size, batch_size, hidden_size
                        ],
                        stream=ctx.stream)
        ctx.save_for_backward(f, x, hidden_init)
        ctx.result = result
        return result[1:, :, :]
Example #17
def compile_cuda( cuda_file ):
    with open( cuda_file, 'rb' ) as f:
        src = f.read()
    from pynvrtc.compiler import Program
    prog = Program( src.decode(), cuda_file )
    ptx  = prog.compile( [
        '-use_fast_math', 
        '-lineinfo',
        '-default-device',
        '-std=c++11',
        '-rdc',
        'true',
        #'-IC:\\ProgramData\\NVIDIA Corporation\OptiX SDK 7.2.0\include',
        #'-IC:\\Program Files\\NVIDIA GPU Computing Toolkit\CUDA\\v11.1\include'
        '-I/usr/local/cuda/include',
        '-I/home/kmorley/Code/support/NVIDIA-OptiX-SDK-7.2.0-linux64-x86_64/include/'
        ] )
    return ptx
Example #18
    def compile(cls):
        """Compiles forward and backward GPU kernels for uni- and bi-directional
        SRU. Assumes there is only one GPU.
        """
        if cls._STREAM is not None:
            return

        prog = Program(SRU_CODE.encode(), 'sru_prog.cu'.encode())
        ptx = prog.compile()
        mod = function.Module()
        mod.load(bytes(ptx.encode()))
        cls._FWD_FUNC = mod.get_function('sru_fwd')
        cls._BWD_FUNC = mod.get_function('sru_bwd')
        cls._BiFWD_FUNC = mod.get_function('sru_bi_fwd')
        cls._BiBWD_FUNC = mod.get_function('sru_bi_bwd')

        Stream = namedtuple('Stream', ['ptr'])
        cls._STREAM = Stream(ptr=torch.cuda.current_stream().cuda_stream)
Example #19
    def compile(self):
        if self.ptx is None:
            program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(self.ptx.encode()))

            self.forget_mult = m.get_function('recurrent_forget_mult')
            self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (self.forget_mult, self.bwd_forget_mult, self.stream)

        self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[torch.cuda.current_device()]
Example #20
    def compile(self):
        if self.ptx is None:
            program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
            GPUReverseForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUReverseForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(self.ptx.encode()))

            self.forget_mult = m.get_function('recurrent_reverse_forget_mult')
            self.bwd_forget_mult = m.get_function('bwd_recurrent_reverse_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUReverseForgetMult.configured_gpus[torch.cuda.current_device()] = (self.forget_mult, self.bwd_forget_mult, self.stream)

        self.forget_mult, self.bwd_forget_mult, self.stream = GPUReverseForgetMult.configured_gpus[torch.cuda.current_device()]
Example #21
def smooth_local_affine(output_cpu, input_cpu, epsilon, patch, h, w, f_r, f_e):
  program = Program(src.encode('utf-8'), 'best_local_affine_kernel.cu'.encode('utf-8'))
  ptx = program.compile(['-I/usr/local/cuda-9.1/include'.encode('utf-8')])
  m = function.Module()
  m.load(bytes(ptx.encode()))

  _reconstruction_best_kernel = m.get_function('reconstruction_best_kernel')
  _bilateral_smooth_kernel = m.get_function('bilateral_smooth_kernel')
  _best_local_affine_kernel = m.get_function('best_local_affine_kernel')
  Stream = namedtuple('Stream', ['ptr'])
  s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

  filter_radius = f_r
  sigma1 = filter_radius / 3
  sigma2 = f_e
  radius = (patch - 1) / 2

  filtered_best_output = torch.zeros(np.shape(input_cpu)).cuda()
  affine_model = torch.zeros((h * w, 12)).cuda()
  filtered_affine_model = torch.zeros((h * w, 12)).cuda()

  input_ = torch.from_numpy(input_cpu).cuda()
  output_ = torch.from_numpy(output_cpu).cuda()
  _best_local_affine_kernel(
    grid=(int((h * w) / 256 + 1), 1),
    block=(256, 1, 1),
    args=[output_.data_ptr(), input_.data_ptr(), affine_model.data_ptr(),
       np.int32(h), np.int32(w), np.float32(epsilon), np.int32(radius)], stream=s
   )

  _bilateral_smooth_kernel(
    grid=(int((h * w) / 256 + 1), 1),
    block=(256, 1, 1),
    args=[affine_model.data_ptr(), filtered_affine_model.data_ptr(), input_.data_ptr(), np.int32(h), np.int32(w), np.int32(f_r), np.float32(sigma1), np.float32(sigma2)], stream=s
  )

  _reconstruction_best_kernel(
    grid=(int((h * w) / 256 + 1), 1),
    block=(256, 1, 1),
    args=[input_.data_ptr(), filtered_affine_model.data_ptr(), filtered_best_output.data_ptr(),
    np.int32(h), np.int32(w)], stream=s
  )
  numpy_filtered_best_output = filtered_best_output.cpu().numpy()
  return numpy_filtered_best_output
Example #22
def get_kernel_func(kname, ksrc, dtype):
    if kname + dtype not in modules:
        ksrc = ksrc.replace('DTYPE', dtype)
        #prog = Program(ksrc.encode('utf-8'), (kname+dtype+'.cu').encode('utf-8'))
        # If the line below raises AttributeError: 'Program' object has no attribute '_program', switch to the encoded call commented out above.
        prog = Program(ksrc, kname + dtype + '.cu')
        ptx = prog.compile()
        log = prog._interface.nvrtcGetProgramLog(prog._program)
        if len(log.strip()) > 0: print(log)
        module = cupy.cuda.function.Module()
        module.load(bytes(ptx.encode()))
        modules[kname + dtype] = module
    else:
        module = modules[kname + dtype]

    Stream = namedtuple('Stream', ['ptr'])
    s = Stream(ptr=torch.cuda.current_stream().cuda_stream)

    return module.get_function(kname), s
Example #23
    def compile(self):
        if GPULayerNorm.ptx is None:
            program = Program(kernel.encode(), 'layer_norm.cu'.encode())
            GPULayerNorm.ptx = program.compile()

        if torch.cuda.current_device() not in GPULayerNorm.configured_gpus:
            m = function.Module()
            m.load(bytes(GPULayerNorm.ptx.encode()))

            self.layer_norm = m.get_function('layer_norm')
            self.bwd_layer_norm = m.get_function('bwd_layer_norm')
            self.bwd_mults = m.get_function('bwd_multiplications')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPULayerNorm.configured_gpus[torch.cuda.current_device()] = (self.layer_norm, self.bwd_layer_norm, self.bwd_mults, self.stream)

        self.layer_norm, self.bwd_layer_norm, self.bwd_mults, self.stream = GPULayerNorm.configured_gpus[torch.cuda.current_device()]
Example #24
    def compile():
        if GPUForgetMult.ptx is None:
            program = Program(kernel, 'recurrent_forget_mult.cu')
            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(GPUForgetMult.ptx.encode()))

            forget_mult = m.get_function('recurrent_forget_mult')
            bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
                forget_mult, bwd_forget_mult, stream)

        return GPUForgetMult.configured_gpus[torch.cuda.current_device()]
Example #25
    def compile(self):
        if self.ptx is None:
            program = Program(kernel, 'recurrent_forget_mult.cu', lib_name="C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v10.1\\bin\\nvrtc64_101_0.dll")
            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(self.ptx.encode()))

            self.forget_mult = m.get_function('recurrent_forget_mult')
            self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
            self.forget_mult, self.bwd_forget_mult, self.stream)

        self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[torch.cuda.current_device()]
Example #26
    def __call__(self, values, col_ind, col_ptr, size, dense):
        func_id = (size[0], size[1], size[2], dense.size(1), dense.size(2))
        if func_id in self.cached_functions:
            func = self.cached_functions[func_id]
        else:
            kernel = open('utils/cuda/sparse_bmm.cu', 'r').read()
            kernel = Template(kernel).substitute(BATCH_SIZE=size[0],
                                                 SPARSE_NUM_ROWS=size[1],
                                                 SPARSE_NUM_COLS=size[2],
                                                 DENSE_NUM_ROWS=dense.size(1),
                                                 DENSE_NUM_COLS=dense.size(2))

            program = Program(kernel, 'sparse_bmm.cu')
            ptx = program.compile()

            m = function.Module()
            m.load(bytes(ptx.encode()))

            func = m.get_function('sparse_bmm')
            self.cached_functions[func_id] = func

        values = values.contiguous()
        col_ind = col_ind.contiguous()
        col_ptr = col_ptr.contiguous()
        dense = dense.contiguous()
        result = values.new(size[0], size[1], dense.size(2))
        block = (8, 8, 8)

        grid = tuple([(result.size(i) + block[i] - 1) // block[i]
                      for i in range(3)])
        func(grid=grid,
             block=block,
             args=[
                 result.data_ptr(),
                 values.data_ptr(),
                 col_ind.data_ptr(),
                 col_ptr.data_ptr(),
                 dense.data_ptr()
             ],
             stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))

        return result
Example #27
    def compile(self):
        if self.ptx is None:
            program = Program(kernel, 'relu.cu')
            GPUReLUF.ptx = program.compile()

        if torch.cuda.current_device() not in GPUReLUF.configured_gpus:
            m = function.Module()
            m.load(bytes(self.ptx))

            self.relu_forward = m.get_function('relu_forward')
            self.relu_backward = m.get_function('relu_backward')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUReLUF.configured_gpus[torch.cuda.current_device()] = (
                self.relu_forward, self.relu_backward, self.stream)

        self.relu_forward, self.relu_backward, self.stream = GPUReLUF.configured_gpus[
            torch.cuda.current_device()]
Example #28
    def compile(self):
        if self.ptx is None:
            #program = Program(kernel.encode(), 'recurrent_forget_mult.cu'.encode())
            # https://github.com/salesforce/pytorch-qrnn/issues/26
            program = Program(kernel, 'recurrent_forget_mult.cu')
            GPUForgetMult.ptx = program.compile()

        if torch.cuda.current_device() not in GPUForgetMult.configured_gpus:
            m = function.Module()
            m.load(bytes(self.ptx.encode()))

            self.forget_mult = m.get_function('recurrent_forget_mult')
            self.bwd_forget_mult = m.get_function('bwd_recurrent_forget_mult')

            Stream = namedtuple('Stream', ['ptr'])
            self.stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

            GPUForgetMult.configured_gpus[torch.cuda.current_device()] = (
                self.forget_mult, self.bwd_forget_mult, self.stream)

        self.forget_mult, self.bwd_forget_mult, self.stream = GPUForgetMult.configured_gpus[
            torch.cuda.current_device()]
Example #29
def _build_cuda_kernels():
    global _apply_disparity_func_pos
    global _apply_disparity_func_neg

    _apply_disparity_pos_kernel = '''
    extern "C" {
        __global__ void apply_disparity_pos(
        float *dst, const float *src, const int *disp, int h, int w, int c, int total_l) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= total_l)
                return;
            int dbase = (i/h/c*h+i%h)*w;
            for (int j = w - 1; j >=0; j--) {
                int idx = j + disp[dbase+j];
                if (idx < w)
                    dst[i*w+idx] = src[i*w+j];
            }
        }
        __global__ void apply_disparity_neg(
        float *dst, const float *src, const int *disp, int h, int w, int c, int total_l) {
            int i = blockIdx.x * blockDim.x + threadIdx.x;
            if (i >= total_l)
                return;
            int dbase = (i/h/c*h+i%h)*w;
            for (int j = 0; j < w; j++) {
                int idx = j + disp[dbase+j];
                if (idx > -1)
                    dst[i*w+idx] = src[i*w+j];
            }
        }
    }
    '''
    program = Program(
        _apply_disparity_pos_kernel, 'apply_disparity.cu')
    m = function.Module()
    m.load(bytes(program.compile().encode()))
    _apply_disparity_func_pos = m.get_function('apply_disparity_pos')
    _apply_disparity_func_neg = m.get_function('apply_disparity_neg')
Example #30
def load_sru_mod():
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if check_sru_requirement():
        from cupy.cuda import function
        from pynvrtc.compiler import Program

        # This sets up device to use.
        device = torch.device("cuda")
        tmp_ = torch.rand(1, 1).to(device)

        sru_prog = Program(SRU_CODE.encode('utf-8'),
                           'sru_prog.cu'.encode('utf-8'))
        sru_ptx = sru_prog.compile()
        sru_mod = function.Module()
        sru_mod.load(bytes(sru_ptx.encode()))

        SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
        SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
        SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
        SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')

        stream = namedtuple('Stream', ['ptr'])
        SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)
Example #31
File: sru.py  Project: Unbabel/OpenNMT-py
def load_sru_mod():
    global SRU_FWD_FUNC, SRU_BWD_FUNC, SRU_BiFWD_FUNC, SRU_BiBWD_FUNC
    global SRU_STREAM
    if check_sru_requirement():
        from cupy.cuda import function
        from pynvrtc.compiler import Program

        # This sets up device to use.
        device = torch.device("cuda")
        tmp_ = torch.rand(1, 1).to(device)

        sru_prog = Program(SRU_CODE.encode('utf-8'),
                           'sru_prog.cu'.encode('utf-8'))
        sru_ptx = sru_prog.compile()
        sru_mod = function.Module()
        sru_mod.load(bytes(sru_ptx.encode()))

        SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
        SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
        SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
        SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')

        stream = namedtuple('Stream', ['ptr'])
        SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)
Example #32
class KernelCache():
  def __init__(self, kernel_name, func_name=None):
    self.func_name = func_name or kernel_name
    self.name = kernel_name + ".cu"
    with open(path.join(kernel_dir, self.name), 'r') as cu_f:
      self.kernel_source = cu_f.read().encode()
    self.prog = Program(self.kernel_source, self.name.encode()) 
    self.cache = {}

  def cached(self, device_id):
    try:
      kernel_func = self.cache[device_id].get_function(self.func_name)
    except KeyError:
      self.cache[device_id] = self.compile_and_prep_kernel(device_id)
      kernel_func = self.cache[device_id].get_function(self.func_name)
    return kernel_func

  def compile_and_prep_kernel(self, device_id):
    ptx = self.prog.compile([self.get_compute_arch_arg(device_id)])
    module = Module()
    module.load(ptx.encode())
    return module

  def get_compute_arch_arg(self, device_id):
    return "-arch=compute_{0}".format(
      device.Device(device_id).compute_capability\
    ).encode()

  def prep_args(self, kwargs):
    args = []
    for k, v in kwargs.items():
      try:
        args.append(v.data_ptr())
      except:
        args.append(v)
    return args
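
A hypothetical usage sketch for the KernelCache class above; the kernel name 'scale_rows', the matching .cu file under kernel_dir, and the launch geometry are assumptions made for illustration and are not part of the original project.

from collections import namedtuple
import torch

Stream = namedtuple('Stream', ['ptr'])

cache = KernelCache('scale_rows')    # assumes kernel_dir/scale_rows.cu defines scale_rows()
x = torch.randn(1024, 256).cuda()
func = cache.cached(x.get_device())  # compiles once, then reuses the cached module per device
func(grid=((x.numel() + 255) // 256, 1, 1),
     block=(256, 1, 1),
     args=cache.prep_args({'x': x, 'n': x.numel()}),
     stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))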
Example #33
class UnICORNN_compile():
    _UnICORNN_PROG = Program(UnICORNN_CODE, 'unicornn_prog.cu')
    _UnICORNN_PTX = _UnICORNN_PROG.compile()
    _DEVICE2FUNC = {}

    def __init__(self):
        super(UnICORNN_compile, self).__init__()

    def compile_functions(self):
        device = torch.cuda.current_device()
        mod = function.Module()
        mod.load(bytes(self._UnICORNN_PTX.encode()))
        fwd_func = mod.get_function('unicornn_fwd')
        bwd_func = mod.get_function('unicornn_bwd')

        Stream = namedtuple('Stream', ['ptr'])
        current_stream = Stream(ptr=torch.cuda.current_stream().cuda_stream)

        self._DEVICE2FUNC[device] = (current_stream, fwd_func, bwd_func)
        return current_stream, fwd_func, bwd_func

    def get_functions(self):
        res = self._DEVICE2FUNC.get(torch.cuda.current_device(), None)
        return res if res else self.compile_functions()
Example #34
        *(grad_bias + col) = gbias1;
        *(grad_bias + col + ncols) = gbias2;
        *(grad_init +col) = cur;
    }
}
"""


if check_sru_requirement():
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # This cuda() is important, it sets up device to use.
    tmp_ = torch.rand(1, 1).cuda()

    sru_prog = Program(SRU_CODE.encode('utf-8'),
                       'sru_prog.cu'.encode('utf-8'))
    sru_ptx = sru_prog.compile()
    sru_mod = function.Module()
    sru_mod.load(bytes(sru_ptx.encode()))

    SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
    SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
    SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')

    stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)


class SRU_Compute(Function):
Example #35
    def __call__(self, input, k):
        out = input.new(input.size(0), input.size(1), input.size(2) // k, input.size(3) // k, 2)

        if not self.jit or isinstance(input, (torch.FloatTensor, torch.DoubleTensor)):
            y = input.view(input.size(0), input.size(1),
                           input.size(2)//out.size(2), out.size(2),
                           input.size(3)//out.size(3), out.size(3),
                           2)
            out = y.mean(4).squeeze(4).mean(2).squeeze(2)
            return out

        if not iscomplex(input):
            raise (TypeError('The input and outputs should be complex'))

        input = input.contiguous()

        if (self.periodize_cache[(input.size(), out.size(), input.get_device())] is None):
            kernel = '''
            #define NW ${W} / ${k}
            #define NH ${H} / ${k}
            extern "C"
            __global__ void periodize(const ${Dtype}2 *input, ${Dtype}2 *output)
            {
              int tx = blockIdx.x * blockDim.x + threadIdx.x;
              int ty = blockIdx.y * blockDim.y + threadIdx.y;
              int tz = blockIdx.z * blockDim.z + threadIdx.z;
              if(tx >= NW || ty >= NH || tz >= ${B})
                return;
              input += tz * ${H} * ${W} + ty * ${W} + tx;
              ${Dtype}2 res = make_${Dtype}2(0.f, 0.f);
              for (int j=0; j<${k}; ++j)
                for (int i=0; i<${k}; ++i)
                {
                  const ${Dtype}2 &c = input[j * NH * ${W} + i * NW];
                  res.x += c.x;
                  res.y += c.y;
                }
              res.x /= ${k} * ${k};
              res.y /= ${k} * ${k};
              output[tz * NH * NW + ty * NW + tx] = res;
            }
            '''
            B = input.nelement() // (2*input.size(-2) * input.size(-3))
            W = input.size(-2)
            H = input.size(-3)
            k = input.size(-2) // out.size(-2)
            kernel = Template(kernel).substitute(B=B, H=H, W=W, k=k, Dtype=getDtype(input))
            name = str(input.get_device())+'-'+str(B)+'-'+str(k)+'-'+str(H)+'-'+str(W)+'-periodize.cu'
            print(name)
            prog = Program(kernel, name.encode())
            ptx = prog.compile(['-arch='+get_compute_arch(input)])
            module = Module()
            module.load(bytes(ptx.encode()))
            self.periodize_cache[(input.size(), out.size(), input.get_device())] = module
        grid = (self.GET_BLOCKS(out.size(-3), self.block[0]),
                self.GET_BLOCKS(out.size(-2), self.block[1]),
                self.GET_BLOCKS(out.nelement() // (2*out.size(-2) * out.size(-3)), self.block[2]))
        periodize = self.periodize_cache[(input.size(), out.size(), input.get_device())].get_function('periodize')
        periodize(grid=grid, block=self.block, args=[input.data_ptr(), out.data_ptr()],
                  stream=Stream(ptr=torch.cuda.current_stream().cuda_stream))
        return out
Example #36
        *(grad_bias + col) = gbias1;
        *(grad_bias + col + ncols) = gbias2;
        *(grad_init +col) = cur;
    }
}
"""


if check_sru_requirement():
    from cupy.cuda import function
    from pynvrtc.compiler import Program

    # This cuda() is important, it sets up device to use.
    tmp_ = torch.rand(1, 1).cuda()

    sru_prog = Program(SRU_CODE.encode('utf-8'),
                       'sru_prog.cu'.encode('utf-8'))
    sru_ptx = sru_prog.compile()
    sru_mod = function.Module()
    sru_mod.load(bytes(sru_ptx.encode()))

    SRU_FWD_FUNC = sru_mod.get_function('sru_fwd')
    SRU_BWD_FUNC = sru_mod.get_function('sru_bwd')
    SRU_BiFWD_FUNC = sru_mod.get_function('sru_bi_fwd')
    SRU_BiBWD_FUNC = sru_mod.get_function('sru_bi_bwd')

    stream = namedtuple('Stream', ['ptr'])
    SRU_STREAM = stream(ptr=torch.cuda.current_stream().cuda_stream)


class SRU_Compute(Function):
Example #37
File: ptxgen.py  Project: NVIDIA/pynvrtc
    sys.exit(1)

try:
    src = None
    options = []

    # Parse all options
    for a in sys.argv[1:]:
        if a.startswith('-'):
            # Treat as compiler option
            options.append(a)
        else:
            # Treat as compiler input
            with open(a, 'rb') as f:
                src = f.read()

    # Create program object
    p = Program(src)

    # Run the compile
    ptx = p.compile(options)

    # Dump the output to stdout
    print(ptx)

    sys.exit(0)

except ProgramException as e:
    # An error occurred, dump it to stdout
    print('ERROR:\n%s\n' % repr(e))