def calcV(I_shape, I_cu, V_cu):
    #Ifull = I
    Ci = I_shape[0]
    iH = I_shape[1]
    iW = I_shape[2]
    N = I_shape[3]
    tiles = iW // 4

    oH = iH
    oW = iW
    padH = 1
    padW = 1

    # adapted from winograd_conv.py
    #if N == 1:
    #    shlN = 0
    #elif N < 32:
    #    shlN = len(bin(N-1))-2
    #else:
    #    shlN = 5
    shlN = 5
    shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = {
        0 : (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300), # 4x8  yyxxx
        1 : (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201), # 4x4  yyxxn
        2 : (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202), # 2x4  yxxnn
        3 : (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203), # 1x4  xxnnn
        4 : (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104), # 1x2  xnnnn
        5 : (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000), # 1x1  nnnnn
    }.get(shlN)

    GYS  = ceil_div(oH, 1 << shlY)
    GXS  = ceil_div(oW, 1 << shlX)
    GN   = ceil_div(N, 1 << shlN)
    # GK   = ceil_div(Co, 32)
    GYS2 = GYS // 2
    GXS2 = GXS  * 2

    div_GXS2 = get_div_mul_shift_32(GXS * GYS, GXS2)
    div_GXS = get_div_mul_shift_32(GXS * GYS, GXS)

    image_size = 1152*Ci*GXS*GYS*GN
    
    print('div_GXS', div_GXS)

    print('GYS', GYS, 'GXS', GXS, 'GN', GN, 'Ci', Ci, 'GY_GX', GXS * GYS)
    grid = (GN, GYS*GXS, Ci)
    block = (32, 1, 1)

    call_cu_kernel(
        k_calcV,
        grid, block,
        V_cu, I_cu,
        
        iH, iW, N, padH, padW,
        GXS, GYS2, GXS2, div_GXS2[0], div_GXS2[1], div_GXS[0], div_GXS[1],
        shlY, shlX, maskY, shrY, maskX, shrX, shlN, maskN,
        iH * iW * N, iW * N, GYS*GXS*Ci*1152, GXS * Ci * 1152, Ci * 1152,
        GXS, GXS * GYS, GN, Ci)
    Context.synchronize()
    timecheck('calced V_cu')
示例#2
0
def gpu_filter(pixels, width, height):
    size = np.array([width, height])
    filtered = np.zeros_like(pixels)
    grid_dim = (width // BLOCK_SIZE, height // BLOCK_SIZE)
    median_filter(In(pixels),
                  Out(filtered),
                  In(size),
                  block=BLOCK,
                  grid=grid_dim)
    Context.synchronize()
    return filtered
def init_the_device_if_needed(do_it_anyway=False):
    if do_it_anyway:
        print 'import pycuda.autoinit'
        import pycuda.autoinit
        return
    try:
        Context.get_device()
    except:
        # Presumably, the line above failed because of something like that:
        # "LogicError: cuCtxGetDevice failed: not initialized"
        # -- initialize the device
        print 'import pycuda.autoinit'
        import pycuda.autoinit
def init_the_device_if_needed(do_it_anyway=False):
    if do_it_anyway:
        print('import pycuda.autoinit')
        import pycuda.autoinit
        return
    try:
        Context.get_device()
    except:
        # Presumably, the line above failed because of something like that:
        # "LogicError: cuCtxGetDevice failed: not initialized"
        # -- initialize the device
        print('import pycuda.autoinit')
        import pycuda.autoinit
示例#5
0
文件: cuda.py 项目: ALEXGUOQ/chainer
def mem_alloc(nbytes):
    """Allocates device memory of given size from memory pool.

    This function chooses memory pool corresponding to the current device.

    Args:
        nbytes (int): The size of memory in bytes.

    Returns:
        pycuda.tools.PooledDeviceAllocation: Allocated memory with additional
        ``device`` attribute. This attribute is used to determine on which GPU
        the memory resides.

    """
    global _pools

    device = Context.get_device()
    pool   = _pools.get(device, None)

    if pool is None:
        pool = drv.DeviceMemoryPool()
        _pools[device] = pool

    allocation = pool.allocate(nbytes)
    setattr(allocation, 'device', device)
    return allocation
示例#6
0
文件: cuda.py 项目: ALEXGUOQ/chainer
def get_device(arg=None):
    """Gets the device from ID ''arg'' or given chainer's
    :class:`~pycuda.gpuarray.GPUArray`.

    Args:
        arg: Value to specify a GPU device.

    Returns:
        Device object specified by given ``arg``.

        The rule of device selection is following.

        ==================================== =====================================
         Type of ``arg``                      Return value
        ==================================== =====================================
         ``None``                             Current device
         ``int``                              Device of ID ``arg``
         :class:`~pycuda.driver.Device`       ``arg``
         :class:`~pycuda.gpuarray.GPUArray`   Device given array was allocated on
         :class:`~numpy.ndarray`              ``None``
        ==================================== =====================================

    """
    if arg is None:
        return Context.get_device()
    elif isinstance(arg, Device):
        return arg
    elif isinstance(arg, numpy.ndarray):
        return None
    elif isinstance(arg, GPUArray):
        while not hasattr(arg.gpudata, 'device'):
            arg = arg.base
        return arg.gpudata.device
    return drv.Device(arg)
示例#7
0
文件: compiler.py 项目: drufat/pycuda
def compile(source, nvcc="nvcc", options=None, keep=False,
        no_extern_c=False, arch=None, code=None, cache_dir=None,
        include_dirs=[], target="cubin"):

    assert target in ["cubin", "ptx", "fatbin"]

    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    if options is None:
        options = DEFAULT_NVCC_FLAGS

    options = options[:]
    if arch is None:
        from pycuda.driver import Error
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except Error:
            pass

    from pycuda.driver import CUDA_DEBUGGING
    if CUDA_DEBUGGING:
        cache_dir = False
        keep = True
        options.extend(["-g", "-G"])

    if cache_dir is None:
        from os.path import join
        import appdirs
        cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"),
                "compiler-cache-v1")

        from os import makedirs
        try:
            makedirs(cache_dir)
        except OSError as e:
            from errno import EEXIST
            if e.errno != EEXIST:
                raise

    if arch is not None:
        options.extend(["-arch", arch])

    if code is not None:
        options.extend(["-code", code])

    if 'darwin' in sys.platform and sys.maxint == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 2147483647:
        options.append('-m32')

    include_dirs = include_dirs + [_find_pycuda_include_path()]

    for i in include_dirs:
        options.append("-I"+i)

    return compile_plain(source, options, keep, nvcc, cache_dir, target)
示例#8
0
def compile(source, nvcc="nvcc", options=[], keep=False,
        no_extern_c=False, arch=None, code=None, cache_dir=None,
        include_dirs=[]):

    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    options = options[:]
    if arch is None:
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except RuntimeError:
            pass

    from pycuda.driver import CUDA_DEBUGGING
    if CUDA_DEBUGGING:
        cache_dir = False
        keep = True
        options.extend(["-g", "-G"])

    if cache_dir is None:
        from os.path import join
        from tempfile import gettempdir
        cache_dir = join(gettempdir(), 
                "pycuda-compiler-cache-v1-%s" % _get_per_user_string())

        from os import mkdir
        try:
            mkdir(cache_dir)
        except OSError, e:
            from errno import EEXIST
            if e.errno != EEXIST:
                raise
示例#9
0
 def __init__(self,
              nvcc='nvcc',
              link_options=None,
              keep=False,
              no_extern_c=False,
              arch=None,
              code=None,
              cache_dir=None,
              include_dirs=[],
              message_handler=None,
              log_verbose=False,
              cuda_libdir=None):
     from pycuda.driver import Context
     compute_capability = Context.get_device().compute_capability()
     if compute_capability < (3, 5):
         raise Exception(
             'Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!'
             % (compute_capability[0], compute_capability[1]))
     else:
         from pycuda.driver import Linker
         self.linker = Linker(message_handler, link_options, log_verbose)
     self._check_arch(arch)
     self.nvcc = nvcc
     self.keep = keep
     self.no_extern_c = no_extern_c
     self.arch = arch
     self.code = code
     self.cache_dir = cache_dir
     self.include_dirs = include_dirs
     self.cuda_libdir = cuda_libdir
     self.libdir, self.libptn = None, None
     self.module = None
def calcO(O_cu, M_shape, M_cu):
    GK = M_shape[2]
    GN = M_shape[0]
    tiles = M_shape[4]

    num_xinu_tiles = GK * 32 * GN * 32 * tiles * tiles
    grid = (ceil_div(num_xinu_tiles, 32), 1, 1)
    block = (32, 1, 1)

    call_cu_kernel(
        k_calcO,
        grid, block,
        O_cu, M_cu,
        num_xinu_tiles
    )
    Context.synchronize()
    timecheck('calced O_cu')
示例#11
0
文件: cuda.py 项目: ALEXGUOQ/chainer
def init(device=None):
    """Initializes CUDA global state.

    Chainer maintains CUDA context, CUBLAS context, random number generator and
    device memory pool for each GPU device and for each process (the main
    process or a process forked by :mod:`multiprocessing`) as global states. When
    called for the first time on the process, this function initializes these global states.

    .. warning::

       This function also initializes PyCUDA and scikits.cuda. Since these
       packages do not support forking after initialization, do not call this
       function before forking the process.

    This function also registers :func:`shutdown` to :mod:`atexit` slot.

    It also initializes random number generator. User can set fixed seed with
    ``CHAINER_SEED`` environment variable.

    Args:
        device (``int`` or :class:`~pycuda.driver.Device` or ``None``): Device
            ID to initialize on.

    """
    global _contexts, _cublas_handles, _generators, _pid, _pools

    if not available:
        global _import_error
        raise RuntimeError(
            'CUDA environment is not correctly set up. ' +
            'The original import error said: ' + str(_import_error))

    pid = os.getpid()
    if _pid == pid:  # already initialized
        return

    drv.init()

    if device is None:  # use default device
        context = cutools.make_default_context()
        device  = Context.get_device()
    else:
        device  = Device(device)
        context = device.make_context()
    _contexts       = {device: context}
    _generators     = {}
    _pools          = {}
    _cublas_handles = {}
    cumisc.init(mem_alloc)

    seed(os.environ.get('CHAINER_SEED'))

    _pid = pid  # mark as initialized
    atexit.register(shutdown)
示例#12
0
def ensure_pycuda_context():
    global pycuda_context, pycuda_initialized
    if not pycuda_initialized:
        if Context is None:
            raise RuntimeError("PyCUDA not found or too old.")
        else:
            pycuda_context = Context.attach()
            import atexit
            atexit.register(pycuda_context.detach)
            pycuda_initialized = True
    return pycuda_context
示例#13
0
 def _check_arch(self, arch):
     if arch is None: return
     try:
         from pycuda.driver import Context
         capability = Context.get_device().compute_capability()
         if tuple(map(int, tuple(arch.split("_")[1]))) > capability:
             from warnings import warn
             warn("trying to compile for a compute capability "
                     "higher than selected GPU")
     except:
         pass
def calcM(N, Co, M_cu, U_shape, U_cu, V_shape, V_cu):
    Co = (U_shape[2] - 1) * 32 + U_shape[4]
    Ci = U_shape[3]
    GK   = ceil_div(Co, 32)
    tiles = V_shape[4]
    GN = V_shape[2]
    print('GK', GK, 'GN', GN, 'tiles', tiles, 'Co', Co, 'Ci', Ci, 'N', N)

    grid = (tiles * tiles,1,1) # b
    block = (32, 16, 1)  # 16 for intel...

    call_cu_kernel(
        k_calcM,
        grid, block,
        M_cu, U_cu, V_cu,
        
        Ci, 1, tiles, GN, GK) #,
        # cl.LocalMemory(32 * 32 * 4), cl.LocalMemory(32 * 32 * 4))
    Context.synchronize()
    timecheck('calced M_cu')
def calcU(W_shape, W_cu, U_cu):
    Ci = W_shape[0]
    kH = W_shape[1]
    kW = W_shape[2]
    Co = W_shape[3]

    # this is adapted from neon's winograd_conv.py:
    GK   = ceil_div(Co, 32)

    filter_size   = 1152*Ci*GK
    grid = (GK, Ci, 1)
    block = (32, 1, 1)
    
    call_cu_kernel(
        k_calcU,
        grid, block,
        U_cu, W_cu,
        kH * kW * Co, kW * Co, kW * Co * 2, Co, Ci * 1152,
        Ci, GK)
    Context.synchronize()
    timecheck('calced U_cu')
示例#16
0
    def _init_thread_memory(self, dev_id: int, ctx: cuda.Context,
                            alloc_size: int) -> None:
        '''
        Single thread that initializes the memory for all the stream for a single 
        GPU. 
        '''

        ctx.push()
        size_per_batch = np.int32(np.ceil(alloc_size / self.num_stream))
        # Initialize streams
        for i in range(self.num_stream):
            self.streams[dev_id].append(cuda.Stream())

        for i in range(0, self.num_stream, 1):
            # allocate memory on device
            self.moments_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 10))))
            self.w_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.x_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))
            self.y_device[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 9))))

            # set host memory for returned output
            self.c_moments[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 7))))
            self.mu[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.yf[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

            self.m1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 5))))
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(0),
                                         size_per_batch,
                                         size_per_batch,
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])
            self.float_value_set[dev_id](self.m1[dev_id][i],
                                         np.float32(1),
                                         size_per_batch,
                                         np.int32(0),
                                         block=self.block_size,
                                         grid=self.grid_size,
                                         stream=self.streams[dev_id][i])

            self.x1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w1[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.x2[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))
            self.w2[dev_id].append(
                (cuda.mem_alloc(int(SIZEOF_FLOAT * size_per_batch * 3))))

        ctx.synchronize()
        ctx.pop()
示例#17
0
文件: cuda.py 项目: ALEXGUOQ/chainer
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
示例#18
0
    def _set_thread_args(self, dev_id: int, ctx: cuda.Context,
                         moment: np.ndarray, w_out: np.ndarray,
                         x_out: np.ndarray, y_out: np.ndarray):
        '''
        Set the input moment for all the stream for a specific GPU
        '''

        ctx.push()
        # number of input for this GPU
        max_size = moment.shape[1]
        # loop through the streams to set their input
        for i in range(0, self.num_stream, 1):
            # Size of input allocated for each stream
            size_per_batch = int(np.ceil(max_size / self.num_stream))
            # location on the original input array where the input to this stream starts
            loc = np.int32((i) * size_per_batch)
            if loc + size_per_batch > max_size:
                size_per_batch = max_size - loc

            self.moment_chunk_host[dev_id].append(
                np.ascontiguousarray(moment[:, loc:loc + size_per_batch],
                                     dtype=np.float32))
            self.moment_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.moment_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)
            self.w_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(w_out[:, loc:loc + size_per_batch])))
            self.w_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.w_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.x_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(x_out[:, loc:loc + size_per_batch])))
            self.x_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.x_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

            self.y_chunk_host[dev_id].append(
                np.ascontiguousarray(
                    np.zeros_like(y_out[:, loc:loc + size_per_batch])))
            self.y_chunk_host[dev_id][i] = cuda.register_host_memory(
                self.y_chunk_host[dev_id][i],
                cuda.mem_host_register_flags.PORTABLE)

        ctx.synchronize()
        ctx.pop()
示例#19
0
 def __init__(self, nvcc='nvcc', link_options=None, keep=False,
         no_extern_c=False, arch=None, code=None, cache_dir=None,
         include_dirs=[],  message_handler=None, log_verbose=False,
         cuda_libdir=None):
     from pycuda.driver import Context
     compute_capability = Context.get_device().compute_capability()
     if compute_capability < (3,5):
         raise Exception('Minimum compute capability for dynamic parallelism is 3.5 (found: %u.%u)!' %
             (compute_capability[0], compute_capability[1]))
     else:
         from pycuda.driver import Linker
         self.linker = Linker(message_handler, link_options, log_verbose)
     self._check_arch(arch)
     self.nvcc = nvcc
     self.keep = keep
     self.no_extern_c = no_extern_c
     self.arch = arch
     self.code = code
     self.cache_dir = cache_dir
     self.include_dirs = include_dirs
     self.cuda_libdir = cuda_libdir
     self.libdir, self.libptn = None, None
     self.module = None
示例#20
0
def compile(source, nvcc="nvcc", options=None, keep=False,
        no_extern_c=False, arch=None, code=None, cache_dir=None,
        include_dirs=[]):

    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    if options is None:
        options = DEFAULT_NVCC_FLAGS

    options = options[:]
    if arch is None:
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except RuntimeError:
            pass

    from pycuda.driver import CUDA_DEBUGGING
    if CUDA_DEBUGGING:
        cache_dir = False
        keep = True
        options.extend(["-g", "-G"])

    if cache_dir is None:
        from os.path import join
        import appdirs
        cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"),
                "compiler-cache-v1")

        from os import makedirs
        try:
            makedirs(cache_dir)
        except OSError, e:
            from errno import EEXIST
            if e.errno != EEXIST:
                raise
示例#21
0
def has_double_support():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (1, 3)
示例#22
0
def compile(source,
            nvcc="nvcc",
            options=None,
            keep=False,
            no_extern_c=False,
            arch=None,
            code=None,
            cache_dir=None,
            include_dirs=[]):

    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    if options is None:
        options = DEFAULT_NVCC_FLAGS

    options = options[:]
    if arch is None:
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except RuntimeError:
            pass

    from pycuda.driver import CUDA_DEBUGGING
    if CUDA_DEBUGGING:
        cache_dir = False
        keep = True
        options.extend(["-g", "-G"])

    if cache_dir is None:
        from os.path import join
        from tempfile import gettempdir
        cache_dir = join(
            gettempdir(),
            "pycuda-compiler-cache-v1-%s" % _get_per_user_string())

        from os import mkdir
        try:
            mkdir(cache_dir)
        except OSError as e:
            from errno import EEXIST
            if e.errno != EEXIST:
                raise

    if arch is not None:
        options.extend(["-arch", arch])

    if code is not None:
        options.extend(["-code", code])

    if 'darwin' in sys.platform and sys.maxint == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 2147483647:
        options.append('-m32')

    include_dirs = include_dirs + [_find_pycuda_include_path()]

    for i in include_dirs:
        options.append("-I" + i)

    return compile_plain(source, options, keep, nvcc, cache_dir)
示例#23
0
def compile(source,
            nvcc="nvcc",
            options=None,
            keep=False,
            no_extern_c=False,
            arch=None,
            code=None,
            cache_dir=None,
            include_dirs=[],
            target="cubin"):

    assert target in ["cubin", "ptx", "fatbin"]

    if not no_extern_c:
        source = 'extern "C" {\n%s\n}\n' % source

    if options is None:
        options = DEFAULT_NVCC_FLAGS

    options = options[:]
    if arch is None:
        from pycuda.driver import Error
        try:
            from pycuda.driver import Context
            arch = "sm_%d%d" % Context.get_device().compute_capability()
        except Error:
            pass

    from pycuda.driver import CUDA_DEBUGGING
    if CUDA_DEBUGGING:
        cache_dir = False
        keep = True
        options.extend(["-g", "-G"])

    if "PYCUDA_CACHE_DIR" in os.environ and cache_dir is None:
        cache_dir = os.environ["PYCUDA_CACHE_DIR"]

    if "PYCUDA_DISABLE_CACHE" in os.environ:
        cache_dir = False

    if cache_dir is None:
        from os.path import join
        import appdirs
        cache_dir = os.path.join(appdirs.user_cache_dir("pycuda", "pycuda"),
                                 "compiler-cache-v1")

        from os import makedirs
        try:
            makedirs(cache_dir)
        except OSError as e:
            from errno import EEXIST
            if e.errno != EEXIST:
                raise

    if arch is not None:
        options.extend(["-arch", arch])

    if code is not None:
        options.extend(["-code", code])

    if 'darwin' in sys.platform and sys.maxsize == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 9223372036854775807:
        options.append('-m64')
    elif 'win32' in sys.platform and sys.maxsize == 2147483647:
        options.append('-m32')

    include_dirs = include_dirs + [_find_pycuda_include_path()]

    for i in include_dirs:
        options.append("-I" + i)

    return compile_plain(source, options, keep, nvcc, cache_dir, target)
示例#24
0
def has_stack():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (2, 0)
def process(iH, iW, N, Ci, Co, kH=3, kW=3):
    inittime()
    np.random.seed(123)

    oH = iH
    oW = iW
    
    tiles = iW // 4

    shlN = 5
    shlY, shlX, maskY, shrY, maskX, shrX, maskN, supY, supX = {
        0 : (4, 5, 0x18, 3, 0x07, 0, 0x00, 0x203, 0x300), # 4x8  yyxxx
        1 : (4, 4, 0x18, 3, 0x06, 1, 0x01, 0x203, 0x201), # 4x4  yyxxn
        2 : (3, 4, 0x10, 4, 0x0c, 2, 0x03, 0x104, 0x202), # 2x4  yxxnn
        3 : (2, 4, 0x00, 0, 0x18, 3, 0x07, 0x000, 0x203), # 1x4  xxnnn
        4 : (2, 3, 0x00, 0, 0x10, 4, 0x0f, 0x000, 0x104), # 1x2  xnnnn
        5 : (2, 2, 0x00, 0, 0x00, 0, 0x1f, 0x000, 0x000), # 1x1  nnnnn
    }.get(shlN)

    GYS  = ceil_div(oH, 1 << shlY)
    GXS  = ceil_div(oW, 1 << shlX)
    GN   = ceil_div(N, 1 << shlN)
    # GK   = ceil_div(Co, 32)
    GYS2 = GYS // 2
    GXS2 = GXS  * 2

    GK   = ceil_div(Co, 32)

    W = np.random.randn(Ci,kH,kW,Co).astype(np.float32)

    I = np.zeros((Ci,iH, iW,N), dtype=np.float32)
    I[:] = np.random.randn(*I.shape)

    print('Co', Co, 'iH', iH, 'iW', iW, 'N', N, 'tiles', tiles)

    W_cu = gpuarray.to_gpu(W)
    I_cu = gpuarray.to_gpu(I)

    U = np.zeros((6, 6, GK, Ci, 32,), dtype=np.float32)
    U_cu = gpuarray.to_gpu(U)

    V = np.zeros((6, 6, GN,GXS, GYS, Ci, 32), dtype=np.float32)
    V_cu = gpuarray.to_gpu(V)

    M = np.zeros((GN, 32, GK, 32, tiles, tiles, 6, 6,), dtype=np.float32)
    M_cu = gpuarray.to_gpu(M)

    O = np.zeros((GN, 32, GK, 32, tiles, tiles, 4, 4,), dtype=np.float32)
    O_cu = gpuarray.to_gpu(O)

    Context.synchronize()
    print('allocated buffers')
    start = time.time()

    for it in range(3):
        calcU(U_cu=U_cu, W_shape=W.shape, W_cu=W_cu)
        calcV(V_cu=V_cu, I_shape=I.shape, I_cu=I_cu)
        calcM(N=N, Co=Co, M_cu=M_cu, U_shape=U.shape, U_cu=U_cu, V_shape=V.shape, V_cu=V_cu)

        calcO(O_cu=O_cu, M_shape=M.shape, M_cu=M_cu)

        Context.synchronize()
        end = time.time()
        print('calcs done')
        print('time for all calcs:', end - start)
        start = time.time()

    O = O_cu.get()
    # cl.enqueue_copy(q, O, O_cu)

    O = O.transpose(2,3, 4,6, 5,7, 0,1).reshape(
        GK * 32, tiles * 4, tiles * 4, GN * 32)
    print('O.shape', O.shape)

    W_from_cu = np.zeros((Ci, 3, 3, Co), dtype=np.float32)
    W_from_cu = W_cu.get()

    U_from_cpu = winograd_cpu.calcU(W=W)
    U_from_cu = np.zeros((6, 6, GK, Ci, 32), dtype=np.float32)
    U_from_cu = U_cu.get()
    U_from_cu_ = U_from_cu.transpose(
        0, 1, 2, 4, 3).reshape(6, 6, GK * 32, Ci)[:, :, :Co]
    assert np.allclose(U_from_cu_, U_from_cpu, atol=1e-4)

    V_from_cpu = winograd_cpu.calcV(I=I)
    V_from_cu = np.copy(V)
    V_from_cu = V_cu.get()
    
    print('tiles', tiles)
    # 0  1   2   3    4   5   6
    # 6, 6, GN,GXS, GYS, Ci, 32
    V_from_cu_ = V_from_cu.transpose(
        2,6,0,1,5,3,4).reshape(
        GN * 32, 6, 6, Ci, tiles, tiles)[:N]
    
    assert np.allclose(V_from_cu_, V_from_cpu, atol=1e-3)

    #      0       1         2        3   4   5   6   7
    # [n//32][n % 32][co // 32][co % 32][th][tw][xi][nu]
    M_from_cpu = winograd_cpu.calcM(U=U_from_cu, V=V_from_cu, N=N, Co=Co)
    M_from_cu = np.copy(M)
    M_from_cu = M_cu.get()
    Context.synchronize()
    M_from_cu = M_from_cu.reshape(GN * 32, GK * 32, tiles, tiles, 6, 6)[:N, :Co]
    
    print(M_from_cu.reshape(M_from_cu.size)[:20])
    
    assert np.allclose(M_from_cu, M_from_cpu, atol=1e-2)
    
    #np.transpose(V_from_cu, [2, 6, 4, 5, 3, 0, 1])
    #V_from_cu = V_from_cu.reshape(GN * 32, 6, 6, Ci, tiles, tiles)[:N,:,:,:,:,:]
    
    return {'W': W, 'O': O, 'I': I}
示例#26
0
def has_stack():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (2, 0)
示例#27
0
def has_double_support():
    from pycuda.driver import Context
    return Context.get_device().compute_capability() >= (1, 3)
示例#28
0
        Ts[i_20]+=An[8]/my_factorial;
        Ts[i_21]+=An[9]/my_factorial;
        Ts[i_22]+=An[10]/my_factorial;
        Ts[i_23]+=An[11]/my_factorial;
        
        
    }
        


    
}
}
"""
try:
    Context.get_device()
except:
    import pycuda.autoinit
mod = SourceModule(krnl, no_extern_c=True)
_gpu_expm = mod.get_function("expm")


def gpu_expm(As, Ts_vectorized, p=12):
    N = len(As)
    if Ts_vectorized.ndim != 2 or Ts_vectorized.shape[1] != 12:
        raise ValueError(Ts_vectorized.shape)


#    threadsPerBlock=1024 # Regardless of the value of N,
# for some reasons this gives errors,
# (only) on the machines with the good graphics
示例#29
0
#!/usr/bin/env python
"""
Created on Wed Sep  3 11:08:37 2014

Author: Oren Freifeld
Email: [email protected]
"""

from pycuda.compiler import SourceModule
from pycuda.driver import Context


try:
    Context.get_device()
except:
    import pycuda.autoinit


class KernelThinWrapper(object):
    def __init__(self, gpu_kernel, include_dirs=[]):
        self._gpu_kernel = gpu_kernel
        self._src_module = SourceModule(gpu_kernel, include_dirs=include_dirs)

    def _get_function_from_src_module(self, func_name):
        self.__dict__["_gpu_" + func_name] = self._src_module.get_function(func_name)

    def __call__(self, *args, **kwargs):
        msg = """
        You need to customize this method in the derived class.
        
        The customized method will usually have 3 parts: