예제 #1
0
def init_cuda():
    global COMPILED_MODULE
    cuda.init()
    context = tools.make_default_context()
    device = context.get_device()
    atexit.register(context.pop)
    COMPILED_MODULE = buffers_cu.percentile_buffer()
예제 #2
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_fft('test_fft_float32_to_complex64_1d'))
    s.addTest(test_fft('test_fft_float32_to_complex64_2d'))
    s.addTest(test_fft('test_batch_fft_float32_to_complex64_1d'))
    s.addTest(test_fft('test_batch_fft_float32_to_complex64_2d'))
    s.addTest(test_fft('test_ifft_complex64_to_float32_1d'))
    s.addTest(test_fft('test_ifft_complex64_to_float32_2d'))
    s.addTest(test_fft('test_batch_ifft_complex64_to_float32_1d'))
    s.addTest(test_fft('test_batch_ifft_complex64_to_float32_2d'))
    s.addTest(test_fft('test_multiple_streams'))
    s.addTest(test_fft('test_work_area'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_fft('test_fft_float64_to_complex128_1d'))
        s.addTest(test_fft('test_fft_float64_to_complex128_2d'))
        s.addTest(test_fft('test_batch_fft_float64_to_complex128_1d'))
        s.addTest(test_fft('test_batch_fft_float64_to_complex128_2d'))
        s.addTest(test_fft('test_ifft_complex128_to_float64_1d'))
        s.addTest(test_fft('test_ifft_complex128_to_float64_2d'))
        s.addTest(test_fft('test_batch_ifft_complex128_to_float64_1d'))
        s.addTest(test_fft('test_batch_ifft_complex128_to_float64_2d'))
    return s
def gpuErrorEvaluate(actual, expected):
    context = make_default_context()
    device = context.get_device()
    p=gpuarray.to_gpu(numpy.array(actual))- gpuarray.to_gpu(numpy.array(expected))
    res= 1.0 - gpuarray.dot(p,p)
    context.pop()
    return res
예제 #4
0
    def init_cuda(self):
        # pycuda alloc
        drv.init()
        global context
        from pycuda.tools import make_default_context
        context = make_default_context()

        mod = SourceModule(r"""
                        #include <stdio.h>
                        #include <math.h>
                        #define PI 3.14159265
                        __global__ void detect(int data[][2], int* rad, int* range, unsigned char *frame, int *pcol) {
                                for(int r = 0; r < rad[0]; r++) {
                                    const int thetaIdx = threadIdx.x;
                                    const int theta = thetaIdx + range[0];
                                    int x = rad[0] + int(r * cos(theta * PI/180)) - 1;
                                    int y = rad[0] - int(r * sin(theta * PI/180)) - 1;
                                    if (data[thetaIdx][0] == 0) data[thetaIdx][1] = r;
                                    if (*(frame + y * *pcol + x) != 0) data[thetaIdx][0] = 1;
                                }
                        }
                        """)

        self.path = mod.get_function("detect")
        print("PLANNER: pycuda alloc end")
        # pycuda alloc end

        time.sleep(2)
예제 #5
0
 def init_context(self, device_id=None):
     if device_id is None:
         context = make_default_context()
         self._context = context
     else:
         context = cuda.Device(device_id).make_context()
         self._context = context
예제 #6
0
 def init_context(self, device_id=None):
     if device_id is None:
         context = make_default_context()
         self._context = context
     else:
         self._context = driver.Device(device_id).make_context()
         self._context.push()
예제 #7
0
 def init_context(self, device_id=None):
     if device_id is None:
         context = make_default_context()
         self._context = context
     else:
         context = cuda.Device(device_id).make_context()
         self._context = context
예제 #8
0
 def setUpClass(cls):
     np.random.seed(0)
     cls.ctx = make_default_context()
     cls.handle = cublasxt.cublasXtCreate()
     cls.nbDevices = 1
     cls.deviceId = np.array([0], np.int32)
     cublasxt.cublasXtDeviceSelect(cls.handle, cls.nbDevices, cls.deviceId)
예제 #9
0
 def setUpClass(cls):
     np.random.seed(0)
     cls.ctx = make_default_context()
     cls.handle = cublasxt.cublasXtCreate()
     cls.nbDevices = 1
     cls.deviceId = np.array([0], np.int32)
     cublasxt.cublasXtDeviceSelect(cls.handle, cls.nbDevices,
                                   cls.deviceId)
예제 #10
0
파일: __init__.py 프로젝트: altus88/striate
def init():
  # MAGIC MAGIC
  import pycuda.driver as cuda
  cuda.init()
  from pycuda.tools import make_default_context
  context = make_default_context()
  device = context.get_device()
  import atexit
  atexit.register(context.detach)
예제 #11
0
 def __init__(self, coord_format=CoordFormat.nx, scalar=False, threadsperblock=64, types={}):
   super().__init__(coord_format, scalar, threadsperblock, types)
   self._context = make_default_context()
   self._device = self._context.get_device()
   mod = SourceModule(self.source)
   self.update_v = mod.get_function("update_v")
   self.update_p = mod.get_function("update_p")
   self.rebound = mod.get_function("rebound")
   self.update = mod.get_function("update")
예제 #12
0
def examine_subsets_cuda(task, A, N, K, threads_per_block):
    # Unpack the task tuple
    subset_start, subset_end = task

    # Create CUDA context
    cuda.init()
    ctx = make_default_context()

    KFAC = math.factorial(K)

    # Keep track of the stride
    stride = subset_end - subset_start

    # Copy A to the GPU
    device_A = cuda.to_device(A.astype(np.int16))

    # Create results array
    results = np.zeros(stride, dtype=np.int32)

    # Copy results array
    device_results = cuda.to_device(results)

    # Number of CUDA blocks
    cuda_blocks = ((stride + threads_per_block - 1) / threads_per_block)

    # Compile CUDA kernel
    mod = SourceModule(
        open(
            "/Users/rsearles/Documents/Repositories/cisc849-16s/project/src/Spike_Neural_Nets/subsets.cu",
            "r").read())
    kernel = mod.get_function("examine_subsets")

    # Run kernel
    kernel(np.int32(N),
           np.int32(K),
           np.int64(subset_start),
           np.int64(subset_end),
           np.int32(KFAC),
           device_A,
           device_results,
           block=(cuda_blocks, 1, 1),
           grid=(threads_per_block, 1))

    # Copy results back
    results = cuda.from_device_like(device_results, results)

    # Free GPU memory
    device_results.free()
    device_A.free()

    # Pop CUDA context (driver yells otherwise)
    ctx.pop()

    # Return the counts
    counts = Counter(results)
    return counts
예제 #13
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_misc('test_maxabs_float32'))
    s.addTest(test_misc('test_maxabs_complex64'))
    s.addTest(test_misc('test_cumsum_float32'))
    s.addTest(test_misc('test_cumsum_complex64'))
    s.addTest(test_misc('test_diff_float32'))
    s.addTest(test_misc('test_diff_complex64'))
    s.addTest(test_misc('test_get_by_index_float32'))
    s.addTest(test_misc('test_set_by_index_dest_float32'))
    s.addTest(test_misc('test_set_by_index_src_float32'))
    s.addTest(test_misc('test_binaryop_2d_int32'))
    s.addTest(test_misc('test_binaryop_2d_float32'))
    s.addTest(test_misc('test_binaryop_2d_complex64'))
    s.addTest(test_misc('test_binaryop_matvec_int32'))
    s.addTest(test_misc('test_binaryop_matvec_float32'))
    s.addTest(test_misc('test_binaryop_matvec_complex64'))
    s.addTest(test_misc('test_sum_float32'))
    s.addTest(test_misc('test_sum_complex64'))
    s.addTest(test_misc('test_mean_float32'))
    s.addTest(test_misc('test_mean_complex64'))
    s.addTest(test_misc('test_var_float32'))
    s.addTest(test_misc('test_var_complex64'))
    s.addTest(test_misc('test_std_float32'))
    s.addTest(test_misc('test_std_complex64'))
    s.addTest(test_misc('test_minmax_float32'))
    s.addTest(test_misc('test_argminmax_float32'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_misc('test_maxabs_float64'))
        s.addTest(test_misc('test_maxabs_complex128'))
        s.addTest(test_misc('test_cumsum_float64'))
        s.addTest(test_misc('test_cumsum_complex128'))
        s.addTest(test_misc('test_diff_float64'))
        s.addTest(test_misc('test_diff_complex128'))
        s.addTest(test_misc('test_get_by_index_float64'))
        s.addTest(test_misc('test_set_by_index_dest_float64'))
        s.addTest(test_misc('test_set_by_index_src_float64'))
        s.addTest(test_misc('test_sum_float64'))
        s.addTest(test_misc('test_sum_complex128'))
        s.addTest(test_misc('test_mean_float64'))
        s.addTest(test_misc('test_mean_complex128'))
        s.addTest(test_misc('test_binaryop_2d_float64'))
        s.addTest(test_misc('test_binaryop_2d_complex128'))
        s.addTest(test_misc('test_binaryop_matvec_float64'))
        s.addTest(test_misc('test_binaryop_matvec_complex128'))
        s.addTest(test_misc('test_var_float64'))
        s.addTest(test_misc('test_var_complex128'))
        s.addTest(test_misc('test_std_float64'))
        s.addTest(test_misc('test_std_complex128'))
    s.addTest(test_misc('test_minmax_float64'))
    s.addTest(test_misc('test_argminmax_float64'))
    return s
예제 #14
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_misc('test_maxabs_float32'))
    s.addTest(test_misc('test_maxabs_complex64'))
    s.addTest(test_misc('test_cumsum_float32'))
    s.addTest(test_misc('test_cumsum_complex64'))
    s.addTest(test_misc('test_diff_float32'))
    s.addTest(test_misc('test_diff_complex64'))
    s.addTest(test_misc('test_get_by_index_float32'))
    s.addTest(test_misc('test_set_by_index_dest_float32'))
    s.addTest(test_misc('test_set_by_index_src_float32'))
    s.addTest(test_misc('test_binaryop_2d_int32'))
    s.addTest(test_misc('test_binaryop_2d_float32'))
    s.addTest(test_misc('test_binaryop_2d_complex64'))
    s.addTest(test_misc('test_binaryop_matvec_int32'))
    s.addTest(test_misc('test_binaryop_matvec_float32'))
    s.addTest(test_misc('test_binaryop_matvec_complex64'))
    s.addTest(test_misc('test_sum_float32'))
    s.addTest(test_misc('test_sum_complex64'))
    s.addTest(test_misc('test_mean_float32'))
    s.addTest(test_misc('test_mean_complex64'))
    s.addTest(test_misc('test_var_float32'))
    s.addTest(test_misc('test_var_complex64'))
    s.addTest(test_misc('test_std_float32'))
    s.addTest(test_misc('test_std_complex64'))
    s.addTest(test_misc('test_minmax_float32'))
    s.addTest(test_misc('test_argminmax_float32'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_misc('test_maxabs_float64'))
        s.addTest(test_misc('test_maxabs_complex128'))
        s.addTest(test_misc('test_cumsum_float64'))
        s.addTest(test_misc('test_cumsum_complex128'))
        s.addTest(test_misc('test_diff_float64'))
        s.addTest(test_misc('test_diff_complex128'))
        s.addTest(test_misc('test_get_by_index_float64'))
        s.addTest(test_misc('test_set_by_index_dest_float64'))
        s.addTest(test_misc('test_set_by_index_src_float64'))
        s.addTest(test_misc('test_sum_float64'))
        s.addTest(test_misc('test_sum_complex128'))
        s.addTest(test_misc('test_mean_float64'))
        s.addTest(test_misc('test_mean_complex128'))
        s.addTest(test_misc('test_binaryop_2d_float64'))
        s.addTest(test_misc('test_binaryop_2d_complex128'))
        s.addTest(test_misc('test_binaryop_matvec_float64'))
        s.addTest(test_misc('test_binaryop_matvec_complex128'))
        s.addTest(test_misc('test_var_float64'))
        s.addTest(test_misc('test_var_complex128'))
        s.addTest(test_misc('test_std_float64'))
        s.addTest(test_misc('test_std_complex128'))
    s.addTest(test_misc('test_minmax_float64'))
    s.addTest(test_misc('test_argminmax_float64'))
    return s
예제 #15
0
파일: __init__.py 프로젝트: iskandr/striate
def init():
  # MAGIC MAGIC
  from pycuda import driver
  driver.init()
  from pycuda.tools import make_default_context
  context = make_default_context()
  device = context.get_device()
  import atexit
  atexit.register(context.detach)

  return context
예제 #16
0
파일: cuda.py 프로젝트: ALEXGUOQ/chainer
def init(device=None):
    """Initializes CUDA global state.

    Chainer maintains CUDA context, CUBLAS context, random number generator and
    device memory pool for each GPU device and for each process (the main
    process or a process forked by :mod:`multiprocessing`) as global states. When
    called for the first time on the process, this function initializes these global states.

    .. warning::

       This function also initializes PyCUDA and scikits.cuda. Since these
       packages do not support forking after initialization, do not call this
       function before forking the process.

    This function also registers :func:`shutdown` to :mod:`atexit` slot.

    It also initializes random number generator. User can set fixed seed with
    ``CHAINER_SEED`` environment variable.

    Args:
        device (``int`` or :class:`~pycuda.driver.Device` or ``None``): Device
            ID to initialize on.

    """
    global _contexts, _cublas_handles, _generators, _pid, _pools

    if not available:
        global _import_error
        raise RuntimeError(
            'CUDA environment is not correctly set up. ' +
            'The original import error said: ' + str(_import_error))

    pid = os.getpid()
    if _pid == pid:  # already initialized
        return

    drv.init()

    if device is None:  # use default device
        context = cutools.make_default_context()
        device  = Context.get_device()
    else:
        device  = Device(device)
        context = device.make_context()
    _contexts       = {device: context}
    _generators     = {}
    _pools          = {}
    _cublas_handles = {}
    cumisc.init(mem_alloc)

    seed(os.environ.get('CHAINER_SEED'))

    _pid = pid  # mark as initialized
    atexit.register(shutdown)
예제 #17
0
파일: __init__.py 프로젝트: wqren/striate
def init():
    # MAGIC MAGIC
    from pycuda import driver
    driver.init()
    from pycuda.tools import make_default_context
    context = make_default_context()
    device = context.get_device()
    import atexit
    atexit.register(context.detach)

    return context
예제 #18
0
def setup_pyfft(device_id=0):
    global CONTEXTS
    if device_id not in CONTEXTS:
        print('initializing GPU device', device_id)
        cuda.init()
        if device_id is None:
            CONTEXTS[0] = make_default_context()
        else:
            dev = cuda.Device(device_id)
            CONTEXTS[device_id] = dev.make_context()
            CONTEXTS[device_id].pop()
    return CONTEXTS[device_id]
예제 #19
0
def suite():
    """test suite
    """
    context = make_default_context()
    device = context.get_device()
    context.pop()

    testsuite = TestSuite()
    testsuite.addTest(test_magma('test_magma_geev_novecs'))
    testsuite.addTest(test_magma('test_symmetric_eig_float32'))
    testsuite.addTest(test_magma('test_symmetric_eig_float64'))
    return testsuite
예제 #20
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_cublasxt('test_cublasXtSgemm'))
    s.addTest(test_cublasxt('test_cublasXtCgemm'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_cublasxt('test_cublasXtDgemm'))
        s.addTest(test_cublasxt('test_cublasXtZgemm'))
    return s
예제 #21
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_cublasxt('test_cublasXtSgemm'))
    s.addTest(test_cublasxt('test_cublasXtCgemm'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_cublasxt('test_cublasXtDgemm'))
        s.addTest(test_cublasxt('test_cublasXtZgemm'))
    return s
예제 #22
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_special('test_sici_float32'))
    s.addTest(test_special('test_exp1_complex64'))
    s.addTest(test_special('test_expi_complex64'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_special('test_sici_float64'))
        s.addTest(test_special('test_exp1_complex128'))
        s.addTest(test_special('test_expi_complex128'))
    return s
예제 #23
0
def prep(image, psf):
    datadim1 = image.shape[0]
    datadim2 = image.shape[1]
    if datadim1 != datadim2:
        ddim = max(datadim1, datadim2)
        s = numpy.binary_repr(ddim - 1)
        s = s[:-1] + '0'  # Guarantee that padding is used
    else:
        ddim = datadim1
        s = numpy.binary_repr(ddim - 1)
    if s.find('0') > 0:
        size = 2**len(s)
        boxd = numpy.zeros((size, size))
        r = size - datadim1
        r1 = r2 = r / 2
        if r % 2 == 1:
            r1 = r // 2 + 1
        c = size - datadim2
        c1 = c2 = c // 2
        if c % 2 == 1:
            c1 = c // 2 + 1
        boxdslice = (slice(r1, datadim1 + r1), slice(c1, datadim2 + c1))
        boxd[boxdslice] = image
    else:
        boxd = image

    boxp = boxd * 0.
    if boxd.shape[0] == psf.shape[0]:
        boxp = psf.copy()
    else:
        r = boxp.shape[0] - psf.shape[0]
        r1 = r // 2 + 1
        c = boxp.shape[1] - psf.shape[1]
        c1 = c // 2 + 1
        boxpslice = (slice(r1,
                           psf.shape[0] + r1), slice(c1, psf.shape[1] + c1))
        boxp[boxpslice] = psf.copy()

    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan(boxp.shape, stream=stream)
    gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64))
    plan.execute(gdata)
    return gdata, boxd.shape, boxdslice, plan, stream
예제 #24
0
    def filter(self):
        import pycuda.gpuarray as gpuarray
        import skcuda.fft as cu_fft
        import skcuda.linalg as linalg
        import pycuda.driver as cuda
        from pycuda.tools import make_default_context
        cuda.init()
        context = make_default_context()
        device = context.get_device()
        signal = self.series[0]
        window = self.series[1]
        linalg.init()
        nfft = determine_size(len(signal) + len(window) - 1)
        # Move data to GPU
        sig_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        win_zero_pad = np.zeros(nfft, dtype=self.precision['float'])
        sig_gpu = gpuarray.zeros(sig_zero_pad.shape,
                                 dtype=self.precision['float'])
        win_gpu = gpuarray.zeros(win_zero_pad.shape,
                                 dtype=self.precision['float'])
        sig_zero_pad[0:len(signal)] = signal
        win_zero_pad[0:len(window)] = window
        sig_gpu.set(sig_zero_pad)
        win_gpu.set(win_zero_pad)

        # Plan forwards
        sig_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        win_fft_gpu = gpuarray.zeros(nfft, dtype=self.precision['complex'])
        sig_plan_forward = cu_fft.Plan(sig_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        win_plan_forward = cu_fft.Plan(win_fft_gpu.shape,
                                       self.precision['float'],
                                       self.precision['complex'])
        cu_fft.fft(sig_gpu, sig_fft_gpu, sig_plan_forward)
        cu_fft.fft(win_gpu, win_fft_gpu, win_plan_forward)

        # Convolve
        out_fft = linalg.multiply(sig_fft_gpu, win_fft_gpu, overwrite=True)
        linalg.scale(2.0, out_fft)

        # Plan inverse
        out_gpu = gpuarray.zeros_like(out_fft)
        plan_inverse = cu_fft.Plan(out_fft.shape, self.precision['complex'],
                                   self.precision['complex'])
        cu_fft.ifft(out_fft, out_gpu, plan_inverse, True)
        out_np = np.zeros(len(out_gpu), self.precision['complex'])
        out_gpu.get(out_np)
        context.pop()
        return out_np
예제 #25
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_special('test_sici_float32'))
    s.addTest(test_special('test_exp1_complex64'))
    s.addTest(test_special('test_expi_complex64'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_special('test_sici_float64'))
        s.addTest(test_special('test_exp1_complex128'))
        s.addTest(test_special('test_expi_complex128'))
    return s
예제 #26
0
파일: convolve.py 프로젝트: bnord/LensPop
def prep(image,psf):
    datadim1 = image.shape[0]
    datadim2 = image.shape[1]
    if datadim1!=datadim2:
        ddim = max(datadim1,datadim2)
        s = numpy.binary_repr(ddim-1)
        s = s[:-1]+'0' # Guarantee that padding is used
    else:
        ddim = datadim1
        s = numpy.binary_repr(ddim-1)
    if s.find('0')>0:
        size = 2**len(s)
        boxd = numpy.zeros((size,size))
        r = size-datadim1
        r1 = r2 = r/2
        if r%2==1:
            r1 = r/2+1
        c = size-datadim2
        c1 = c2 = c/2
        if c%2==1:
            c1 = c/2+1
        boxdslice = (slice(r1,datadim1+r1),slice(c1,datadim2+c1))
        boxd[boxdslice] = image
    else:
        boxd = image

    boxp = boxd*0.
    if boxd.shape[0]==psf.shape[0]:
        boxp = psf.copy()
    else:
        r = boxp.shape[0]-psf.shape[0]
        r1 = r/2+1
        c = boxp.shape[1]-psf.shape[1]
        c1 = c/2+1
        boxpslice = (slice(r1,psf.shape[0]+r1),slice(c1,psf.shape[1]+c1))
        boxp[boxpslice] = psf.copy()

    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan(boxp.shape,stream=stream)
    gdata = gpuarray.to_gpu(boxp.astype(numpy.complex64))
    plan.execute(gdata)
    return gdata,boxd.shape,boxdslice,plan,stream
예제 #27
0
파일: util.py 프로젝트: yk/tensorpack
    def _setup(self):
        ctx = None
        try:
            import pycuda.driver as pd
            import pycuda.tools as pt
            pd.init()
            ctx = pt.make_default_context()
        except:
            pass

        try:
            super()._setup()
        finally:
            if ctx is not None:
                ctx.detach()
예제 #28
0
def cuda_init(shape):
    try:
        import pyfft, pycuda
    except ImportError:
        out(
            2,
            'No CUDA bindings found (pyfft). Using regular convolution modules'
        )
        return None, None
    out(2, 'CUDA bindings found!')
    from pycuda.tools import make_default_context
    #    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda
    cuda.init()  #@UndefinedVariable
    context = make_default_context()
    return context, get_pyfft_plan(shape)
예제 #29
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_integrate('test_trapz_float32'))
    s.addTest(test_integrate('test_trapz_complex64'))
    s.addTest(test_integrate('test_trapz2d_float32'))
    s.addTest(test_integrate('test_trapz2d_complex64'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_integrate('test_trapz_float64'))
        s.addTest(test_integrate('test_trapz_complex128'))
        s.addTest(test_integrate('test_trapz2d_float64'))
        s.addTest(test_integrate('test_trapz2d_complex128'))
    return s
예제 #30
0
def Add_gpu(x, y):
    cuda.init()
    ctx = make_default_context()
    device = ctx.get_device()

    a = numpy.random.randn(4, 4)
    a = a.astype(numpy.float32)
    a_gpu = cuda.mem_alloc(a.nbytes)
    cuda.memcpy_htod(a_gpu, a)
    '''
    x_gpu = cuda.mem_alloc(sys.getsizeof(x))
    y_gpu = cuda.mem_alloc(sys.getsizeof(y))
    z_gpu = cuda.mem_alloc(sys.getsizeof(x))
    cuda.memcpy_htod(x_gpu, x)
    cuda.memcpy_htod(y_gpu, y)
    
    mod = SourceModule("""
      __global__ void add(int *x_addr, int *y_addr, int* z_addr)
      {
          // naive computation
          sum = *x_addr + *y_addr;
          *z_addr = sum;
      }
    """)

    func = mod.get_function("add")
    func(x_gpu, y_gpu, z_gpu, block=(4,4,1))
    '''

    mod = SourceModule("""
      __global__ void doublify(float *a)
      {
          int idx = threadIdx.x + threadIdx.y*4;
          a[idx] *= 2;
      }
    """)
    func = mod.get_function("doublify")
    func(a_gpu, block=(4, 4, 1))

    #cuda.memcpy_dtoh(x, z_gpu)
    a_doubled = numpy.empty_like(a)
    cuda.memcpy_dtoh(a_doubled, a_gpu)

    ctx.pop()

    return 2 * (x + y)
예제 #31
0
    def is_memory_enough(a):
        try:
            rest, total = driver.mem_get_info()
        except driver.LogicError: # child thread cannot use context from the main thread...
            # the following does not work yet

            from pycuda import tools
            import skcuda
            
            driver.init()
            context = tools.make_default_context() # try to make as new context, but cannot deactivate the old context stack
            device = context.get_device()
            skcuda.misc.init_context(device)
            rest, total = driver.mem_get_info()
            
        if (sys.getsizeof(a) * 2) < rest:
            return True
예제 #32
0
def count_triangles_cublas(adjacency_list):
    driver.init()
    context = tools.make_default_context()
    h = cublas.cublasCreate()
    n = len(adjacency_list)
    A = np.zeros([n,n], dtype=np.float64)
    for row_idx, neighbor_list in adjacency_list:
        A[row_idx, neighbor_list] = 1.0
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.empty(A.shape, A.dtype)
    c_gpu = gpuarray.empty(A.shape, A.dtype)
    one = np.float64(1.0)
    zero = np.float64(0.0)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, a_gpu.gpudata, n, zero, b_gpu.gpudata, n)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, b_gpu.gpudata, n, zero, c_gpu.gpudata, n)
    trace = linalg.trace(c_gpu, h)
    cublas.cublasDestroy(h)
    context.detach()
    return int(trace/6)
예제 #33
0
    def __init__(self, devnum=None):
        # type: (int) -> None
        env = os.environ.get("SAS_OPENCL", "").lower()
        if devnum is None and env.startswith("cuda:"):
            devnum = int(env[5:])

        # Set the global context to the particular device number if one is
        # given, otherwise use the default context.  Perhaps this will be set
        # by an environment variable within autoinit.
        if devnum is not None:
            self.context = cuda.Device(devnum).make_context()
        else:
            self.context = make_default_context()

        ## Byte boundary for data alignment.
        #self.data_boundary = max(d.min_data_type_align_size
        #                         for d in self.context.devices)

        # Cache for compiled programs, and for items in context.
        self.compiled = {}
예제 #34
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_integrate('test_trapz_float32'))
    s.addTest(test_integrate('test_trapz_complex64'))
    s.addTest(test_integrate('test_simps_float32'))
    s.addTest(test_integrate('test_simps_complex64'))
    s.addTest(test_integrate('test_trapz2d_float32'))
    s.addTest(test_integrate('test_trapz2d_complex64'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_integrate('test_trapz_float64'))
        s.addTest(test_integrate('test_trapz_complex128'))
        s.addTest(test_integrate('test_simps_float64'))
        s.addTest(test_integrate('test_simps_complex128'))
        s.addTest(test_integrate('test_trapz2d_float64'))
        s.addTest(test_integrate('test_trapz2d_complex128'))
    return s
예제 #35
0
def init(device=-1):
  global CONTEXT
  if CONTEXT is not None:
    return
  
  # MAGIC MAGIC
  from pycuda import driver
  driver.init()
  
  if device == -1:
    from pycuda.tools import make_default_context
    CONTEXT = make_default_context()
    device = CONTEXT.get_device()
  else:
    device = driver.Device(device % driver.Device.count())
    CONTEXT = device.make_context()
  
  print 'Starting up using device: %s:%s' % (device.name(), device.pci_bus_id()) 
  import atexit
  atexit.register(CONTEXT.detach)
  return CONTEXT
예제 #36
0
파일: __init__.py 프로젝트: allenbo/distnet
def init(device=-1):
  global CONTEXT
  if CONTEXT is not None:
    return

  # MAGIC MAGIC
  from pycuda import driver
  driver.init()

  if device == -1:
    from pycuda.tools import make_default_context
    CONTEXT = make_default_context()
    device = CONTEXT.get_device()
  else:
    device = driver.Device(device % driver.Device.count())
    CONTEXT = device.make_context()

  #print 'Starting up using device: %s:%s' % (device.name(), device.pci_bus_id())
  import atexit
  atexit.register(CONTEXT.detach)
  return CONTEXT
예제 #37
0
 def solve(self,a,b,max):
     #context and kernel initialisation
     util.log.info("Initialising CUDA device")
     self.ctx = ctools.make_default_context()
     self.ctx.push()
     self.kernels=SourceModule(self.r_kernels)
     
     #Memory
     d_a=cuda.mem_alloc(a.astype(self.type).nbytes)
     d_b=cuda.mem_alloc(b.astype(self.type).nbytes)
     cuda.memcpy_htod(d_a,a.astype(self.type))
     cuda.memcpy_htod(d_b,b.astype(self.type))
     h_b=np.empty_like(b.astype(self.type))
     self.go=time()
     
     #Go solve
     go=self.kernels.get_function("solve")
     go(d_a,d_b,block=(1,1,1),grid=(1,1))
     cuda.memcpy_dtoh(h_b,d_b)
     self.done=time()
     self.ctx.pop()
     self.ctx.detach()
     return h_b
예제 #38
0
def image_cuda(grids):
    """ Run 2d FFT to image each plane of grid array
    """

    from pyfft.cuda import Plan
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as cuda

    nints, npixx, npixy = grids.shape

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((npixx, npixy), stream=stream)

    grid_gpu = gpuarray.to_gpu(grids)
    for i in range(0, nints):
        plan.execute(grid_gpu[i], inverse=True)
    grids = grid_gpu.get()

    context.pop()
    return recenter(grids.real, (npixx//2, npixy//2))
예제 #39
0
def simpleFourierTest2D(N=2048):
    """
    Using PyFFT to call CUDA.

    :return:
    """
    from pyfft.cuda import Plan
    import pycuda.driver as cuda
    from pycuda.tools import make_default_context
    import pycuda.gpuarray as gpuarray
    import time

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream)
    x = np.ones((N, N), dtype=np.complex64)

    x_gpu = gpuarray.to_gpu(x)

    plan.execute(x_gpu)
    res = x_gpu.get()
    plan.execute(x_gpu, inverse=True)
    result = x_gpu.get()
    context.pop()

    error = np.abs(np.sum(np.abs(x) - np.abs(result)) / x.size)
    #print 'Error:', error

    #Single precision
    print 'Array size %i x %i' % (N, N)
    print 'Single Precisions'
    x = np.random.random((N, N))
    x = x.astype(np.complex64)

    start = time.time()
    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex64, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()
    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-2?'
    print np.testing.assert_allclose(xf, result, rtol=1e-2)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime

    #Double precision
    print '\n\nDouble Precision'
    x = np.random.random((N, N))
    x = x.astype(np.complex128)

    start = time.time()

    cuda.init()
    context = make_default_context()
    stream = cuda.Stream()

    plan = Plan((N, N), dtype=np.complex128, stream=stream, fast_math=True)

    x_gpu = gpuarray.to_gpu(x)
    plan.execute(x_gpu)
    result = x_gpu.get()
    context.pop()

    end = time.time()
    cudatime = end - start

    #numpy
    start = time.time()
    xf = np.fft.fft2(x)
    end = time.time()
    numpytime = end - start

    print 'Same to 1e-7?'
    print np.testing.assert_allclose(xf, result, rtol=1e-7)
    print 'Numpy time', numpytime
    print 'CUDA time', cudatime
예제 #40
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
예제 #41
0
파일: ssf.py 프로젝트: cvarin/PyOFTK
def ssfgpuStd(u0,
              dt,
              dz,
              nz,
              alpha,
              betap,
              gamma,
              maxiter=4,
              tol=1e-5,
              phiNLOut=False):
    '''	
	Very simple implementation of the symmetrized split-step fourier algo.
	Solve the NLS equation with the SPM nonlinear terme only.

		* error: third in step size
		* u0 : Input field
		* dt: Time increment
		* dz: Space increment
		* nz: Number of space propagation step
		* alpha: Loss/Gain parameter (array)
		* betap: Beta array beta[2] = GVD, beta[3] = TOD, etc...
		* gamma: Nonlinear parameter
		* maxiter: Maximal number of iteration per step (4)
		* tol: Error for each step (1e-5)
		* phiNLOut: If True return the nonlinear phase shift (True)

		--- GPU Version (float precision) ---
	'''

    nt = len(u0)
    e_ini = pow(abs(u0), 2).sum()
    w = wspace(dt * nt, nt)
    phiNL = 0.0

    # Make sure u0 is in single precision
    u0 = u0.astype(complex64)
    alpha = alpha.astype(complex64)
    u1 = u0
    uv = empty_like(u0)

    # Construction of the linear operator
    halfstep = -alpha / 2.0
    if len(betap) != nt:
        for ii in arange(len(betap)):
            halfstep = halfstep - 1.0j * betap[ii] * pow(w, ii) / factorial(ii)
    halfstep = exp(halfstep * dz / 2.0).astype(complex64)

    # CUDA Kitchen sink
    cuda.init()
    context = make_default_context()
    fftPlan = Plan((1, nt), dtype=numpy.complex64)

    # Allocate memory to the device
    gpu_halfstep = gpuarray.to_gpu(halfstep)
    gpu_u0 = gpuarray.to_gpu(u0)
    gpu_u1 = gpuarray.to_gpu(u1)
    gpu_uhalf = gpuarray.empty_like(gpu_u0)
    gpu_uv = gpuarray.empty_like(gpu_u0)
    gpu_ufft = gpuarray.empty_like(gpu_u0)

    fftPlan.execute(gpu_u0, gpu_ufft)

    # GPU Kernel corresponding to the linear operator
    halfStepKernel = ElementwiseKernel(
        "pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf",
        "uhalf[i] = u[i] * halfstep[i]",
        "halfstep_linear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU Kernel corresponding to the nonlinear operator
    nlKernel = ElementwiseKernel(
        "pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz",
        """
		float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2);
		float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2);
		float realArg = -gamma*(u1_int + u0_int)*dz;
		float euler1 = cos(realArg);
		float euler2 = sin(realArg);
		uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2;
		uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2;
		""",
        "halfstep_nonlinear",
        preamble="#include <pycuda-complex.hpp>",
    )

    # GPU reduction kernel computing the error between two complex array
    computeError = ReductionKernel(
        numpy.float32,
        neutral="0",
        reduce_expr="a+b",
        map_expr="pow(abs(a[i] - b[i]),2)",
        arguments="pycuda::complex<float> *a, pycuda::complex<float> *b",
        name="error_reduction",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Perfom a deep copy of a complex gpuarray
    complexDeepCopy = ElementwiseKernel(
        "pycuda::complex<float> *u1, pycuda::complex<float> *u2",
        "u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im",
        "gpuarray_deepcopy",
        preamble="#include <pycuda-complex.hpp>",
    )

    # Main Loop
    for iz in arange(nz):
        # First application of the linear operator
        halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf)
        fftPlan.execute(gpu_uhalf, inverse=True)
        for ii in arange(maxiter):
            # Application de l'operateur nonlineaire en approx. l'integral de N(z)dz
            # avec la methode du trapeze
            nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma),
                     float(dz / 2.0))
            fftPlan.execute(gpu_uv)
            # Second application of the linear operator
            halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft)
            fftPlan.execute(gpu_ufft, gpu_uv, inverse=True)

            error = computeError(gpu_u1, gpu_uv).get() / e_ini

            if (error < tol):
                complexDeepCopy(gpu_u1, gpu_uv)
                break
            else:
                complexDeepCopy(gpu_u1, gpu_uv)

        if (ii >= maxiter - 1):
            context.pop()
            raise Exception, "Failed to converge"

        complexDeepCopy(gpu_u0, gpu_u1)

    u1 = gpu_u1.get()
    context.pop()

    if phiNLOut:
        return [u1, phiNL]
    else:
        return u1
예제 #42
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_cublas('test_cublasIsamax'))
    s.addTest(test_cublas('test_cublasIcamax'))
    s.addTest(test_cublas('test_cublasIsamin'))
    s.addTest(test_cublas('test_cublasIcamin'))
    s.addTest(test_cublas('test_cublasSasum'))
    s.addTest(test_cublas('test_cublasScasum'))
    s.addTest(test_cublas('test_cublasSaxpy'))
    s.addTest(test_cublas('test_cublasCaxpy'))
    s.addTest(test_cublas('test_cublasScopy'))
    s.addTest(test_cublas('test_cublasCcopy'))
    s.addTest(test_cublas('test_cublasSdot'))
    s.addTest(test_cublas('test_cublasCdotu'))
    s.addTest(test_cublas('test_cublasCdotc'))
    s.addTest(test_cublas('test_cublasSrnm2'))
    s.addTest(test_cublas('test_cublasScrnm2'))
    s.addTest(test_cublas('test_cublasSscal'))
    s.addTest(test_cublas('test_cublasCscal'))
    s.addTest(test_cublas('test_cublasSrot'))
    s.addTest(test_cublas('test_cublasSswap'))
    s.addTest(test_cublas('test_cublasCswap'))
    s.addTest(test_cublas('test_cublasSgemv'))
    s.addTest(test_cublas('test_cublasCgemv'))
    s.addTest(test_cublas('test_cublasSgeam'))
    s.addTest(test_cublas('test_cublasCgeam'))
    s.addTest(test_cublas('test_cublasSgemmBatched'))
    s.addTest(test_cublas('test_cublasCgemmBatched'))
    s.addTest(test_cublas('test_cublasStrsmBatched'))
    s.addTest(test_cublas('test_cublasSgetrfBatched'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_cublas('test_cublasIdamax'))
        s.addTest(test_cublas('test_cublasIzamax'))
        s.addTest(test_cublas('test_cublasIdamin'))
        s.addTest(test_cublas('test_cublasIzamin'))
        s.addTest(test_cublas('test_cublasDasum'))
        s.addTest(test_cublas('test_cublasDzasum'))
        s.addTest(test_cublas('test_cublasDaxpy'))
        s.addTest(test_cublas('test_cublasZaxpy'))
        s.addTest(test_cublas('test_cublasDcopy'))
        s.addTest(test_cublas('test_cublasZcopy'))
        s.addTest(test_cublas('test_cublasDdot'))
        s.addTest(test_cublas('test_cublasZdotu'))
        s.addTest(test_cublas('test_cublasZdotc'))
        s.addTest(test_cublas('test_cublasDrnm2'))
        s.addTest(test_cublas('test_cublasDzrnm2'))
        s.addTest(test_cublas('test_cublasDscal'))
        s.addTest(test_cublas('test_cublasZscal'))
        s.addTest(test_cublas('test_cublasZdscal'))
        s.addTest(test_cublas('test_cublasDswap'))
        s.addTest(test_cublas('test_cublasZswap'))
        s.addTest(test_cublas('test_cublasDgemv'))
        s.addTest(test_cublas('test_cublasZgemv'))
        s.addTest(test_cublas('test_cublasDgeam'))
        s.addTest(test_cublas('test_cublasZgeam'))
        s.addTest(test_cublas('test_cublasDgemmBatched'))
        s.addTest(test_cublas('test_cublasZgemmBatched'))
        s.addTest(test_cublas('test_cublasDtrsmBatched'))
        s.addTest(test_cublas('test_cublasDgetrfBatched'))
    return s
예제 #43
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
예제 #44
0
파일: ssf.py 프로젝트: cvarin/PyOFTK
def ssfgpuStd(u0, dt, dz, nz, alpha, betap, gamma, maxiter = 4, tol = 1e-5, phiNLOut = False):

	'''	
	Very simple implementation of the symmetrized split-step fourier algo.
	Solve the NLS equation with the SPM nonlinear terme only.

		* error: third in step size
		* u0 : Input field
		* dt: Time increment
		* dz: Space increment
		* nz: Number of space propagation step
		* alpha: Loss/Gain parameter (array)
		* betap: Beta array beta[2] = GVD, beta[3] = TOD, etc...
		* gamma: Nonlinear parameter
		* maxiter: Maximal number of iteration per step (4)
		* tol: Error for each step (1e-5)
		* phiNLOut: If True return the nonlinear phase shift (True)

		--- GPU Version (float precision) ---
	'''	

	nt = len(u0)
	e_ini = pow(abs(u0),2).sum()
	w = wspace(dt*nt,nt)
	phiNL = 0.0

	# Make sure u0 is in single precision
	u0=u0.astype(complex64)
	alpha=alpha.astype(complex64)
	u1 = u0
	uv = empty_like(u0)

	# Construction of the linear operator
	halfstep = -alpha/2.0	
	if len(betap) != nt:
		for ii in arange(len(betap)):
			halfstep = halfstep - 1.0j*betap[ii]*pow(w,ii)/factorial(ii)
	halfstep = exp(halfstep*dz/2.0).astype(complex64)

	# CUDA Kitchen sink
	cuda.init()
	context = make_default_context()
	fftPlan = Plan((1, nt), dtype=numpy.complex64)

	# Allocate memory to the device
	gpu_halfstep = gpuarray.to_gpu(halfstep)
	gpu_u0 = gpuarray.to_gpu(u0)
	gpu_u1 = gpuarray.to_gpu(u1)
	gpu_uhalf = gpuarray.empty_like(gpu_u0)
	gpu_uv = gpuarray.empty_like(gpu_u0)
	gpu_ufft = gpuarray.empty_like(gpu_u0)
		
	fftPlan.execute(gpu_u0, gpu_ufft)
	
	# GPU Kernel corresponding to the linear operator
	halfStepKernel = ElementwiseKernel("pycuda::complex<float> *u, pycuda::complex<float> *halfstep, pycuda::complex<float> *uhalf",
		"uhalf[i] = u[i] * halfstep[i]",
		"halfstep_linear",
		preamble="#include <pycuda-complex.hpp>",)
	
	# GPU Kernel corresponding to the nonlinear operator
	nlKernel = ElementwiseKernel("pycuda::complex<float> *uhalf, pycuda::complex<float> *u0, pycuda::complex<float> *u1, pycuda::complex<float> *uv, float gamma, float dz",
		"""
		float u0_int = pow(u0[i]._M_re,2) + pow(u0[i]._M_im,2);
		float u1_int = pow(u1[i]._M_re,2) + pow(u1[i]._M_im,2);
		float realArg = -gamma*(u1_int + u0_int)*dz;
		float euler1 = cos(realArg);
		float euler2 = sin(realArg);
		uv[i]._M_re = uhalf[i]._M_re * euler1 - uhalf[i]._M_im * euler2;
		uv[i]._M_im = uhalf[i]._M_im * euler1 + uhalf[i]._M_re * euler2;
		""",
		"halfstep_nonlinear",
		preamble="#include <pycuda-complex.hpp>",)
	
	# GPU reduction kernel computing the error between two complex array
	computeError = ReductionKernel(numpy.float32, neutral="0",
		reduce_expr="a+b", map_expr="pow(abs(a[i] - b[i]),2)",
		arguments="pycuda::complex<float> *a, pycuda::complex<float> *b",
		name="error_reduction",
		preamble="#include <pycuda-complex.hpp>",)

	# Perfom a deep copy of a complex gpuarray
	complexDeepCopy = ElementwiseKernel("pycuda::complex<float> *u1, pycuda::complex<float> *u2",
		"u1[i]._M_re = u2[i]._M_re;u1[i]._M_im = u2[i]._M_im",
		"gpuarray_deepcopy",
		preamble="#include <pycuda-complex.hpp>",)
	
	# Main Loop
	for iz in arange(nz):
		# First application of the linear operator
		halfStepKernel(gpu_ufft, gpu_halfstep, gpu_uhalf)
		fftPlan.execute(gpu_uhalf, inverse=True)
		for ii in arange(maxiter):
			# Application de l'operateur nonlineaire en approx. l'integral de N(z)dz
			# avec la methode du trapeze 
			nlKernel(gpu_uhalf, gpu_u0, gpu_u1, gpu_uv, float(gamma), float(dz/2.0))
			fftPlan.execute(gpu_uv)
			# Second application of the linear operator
			halfStepKernel(gpu_uv, gpu_halfstep, gpu_ufft)
			fftPlan.execute(gpu_ufft, gpu_uv, inverse=True)

			error = computeError(gpu_u1, gpu_uv).get() / e_ini

			if (error < tol):
				complexDeepCopy(gpu_u1, gpu_uv)
				break
			else:
				complexDeepCopy(gpu_u1, gpu_uv)
		
		if (ii >= maxiter-1):
			context.pop()
			raise Exception, "Failed to converge"

		complexDeepCopy(gpu_u0, gpu_u1)

	u1 = gpu_u1.get()
	context.pop()

	if phiNLOut:
		return [u1, phiNL]
	else:
		return u1
예제 #45
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
     cls.cublas_handle = cublas.cublasCreate()
예제 #46
0
def split_step_GPU(A0, z_array,       # Array for solution points
                t_op = 0, w_op = 0, nlin = 0,  # Constant operators 
                dt = 1,           # sampling time
                t_nl_op = None,   # Additional operator f(A, dt, z)
                apod = True,      # Boundary conditition
                varying_operator = False, # Do operators vary in x
                dynamic_predictor = True,
                plot_hook = None, n_plots = 3,  # not used anymore
                tollerance = 0.04, ):
    
    import pycuda.autoinit

    from pycuda.tools import make_default_context, dtype_to_ctype
    import pycuda.gpuarray as gpuarray
    from pycuda import cumath
    from pyfft.cuda import Plan
    from pycuda.compiler import SourceModule
    from pycuda.driver import Context
    from pycuda.elementwise import get_axpbyz_kernel, get_axpbz_kernel, get_binary_op_kernel, get_elwise_kernel,ElementwiseKernel
            
    ## Initialization
    n_points = A0.shape[0]
    # w = fftfreq(npoints, dx) * 2 * pi
    A_t = A0[:] +0.j
    #A_t.dtype = complex64
    A_w = fft(A_t) * dt
    
    
    # Apodization (AK boundary conditions)
    # TODO making it smooth
    apod_array = ones(n_points, dtype = complex64)
    apod_array[0:n_points/50] = 0
    apod_array[-n_points/50:-1] = 0

    z0 = z_array[0]
    zf = z_array[-1]
    
    delta_z = 1.*(z_array[1]-z_array[0])/4
    done_once = False

    #plan = c_uint()
    #dll.cufftPlan1d(byref(plan), n_points, 0x29, 1)
    #fft_g  = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, -1)
    #ifft_g = lambda x, y: dll.cufftExecC2C(plan, x.ptr, y.ptr, 1)
    
    ## GPU modules #####
    if pycuda.autoinit.context:
        context = pycuda.autoinit.context
    else:
        context =  make_default_context()
    block = (16,1,1)
    grid  = (n_points/block[0], 1)

    ## Init GPU kernels ####
    ## fft, scale dx is included in the definition here
    plan = Plan(n_points,wait_for_finish = True, scale = dt)   
    fft_g  = lambda ain, aout: plan.execute(ain, aout,)
    ifft_g = lambda x, y: plan.execute(x, y, inverse = True)

    ## Multiplication
    prod  = ElementwiseKernel(
            "pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z",
            """
            z[i] = x[i] * y[i];
            """,
            "product",
            preamble = "")
    #prod  = lambda x,y,z: prod(x,y,z, block, grid)
    
    ## Non-linearity
    nonLinear = ElementwiseKernel(
            "pycuda::complex<float> *x, pycuda::complex<float> nlin, pycuda::complex<float> *y, pycuda::complex<float> *z",
            """
            pycuda::complex<float> I_UNIT(0.,1.);
            float I = pycuda::abs(y[i]);
            z[i] = x[i] * pycuda::exp(I_UNIT * I * nlin);
            """,
            "nonLinear",
            preamble = "")
    
    ## Evaluate the solution with current values at delta_z step
    ## separated so that can be re-used for error prediction
    ## contains some lazy eveluation just to be CUDA-implementation ready
    ## and reducing the number of array creation
    
    def f(A_t, A_w, dz = delta_z):
        if f.delta_z != dz:
            f.w_exp = cumath.exp(-1j * dz/2. * w_op)
            f.t_exp = cumath.exp(-1j * dz * t_op)
            f.delta_z = dz
        
        ## Dispersion (I pass)
        f.A_t = A_t
        f.A_w = A_w
        
        #print A_w.get()[n_points/2],     
        prod(A_w, f.w_exp, A_w)
        #A_w = f.w_exp*A_w
        #print A_w.get()[n_points/2],
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        ## Constant potential term
        prod(f.A_t, f.t_exp, f.A_t)

        ## Nonlinear operator as intensity dependency
        if nlin != 0:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * nlin * f.A_t * f.A_t.conj())
        ## Additional nonlinear terms as a function t_nl_op(A(t),dt,z)
        if t_nl_op != None:
            f.A_t = f.A_t * cumath.exp(-1j * delta_z * t_nl_op(f.A_t, dt, z0+delta_z/2) )
        ## Apodization
        if apod:
            prod(f.A_t, apod_array, f.A_t)
            
        fft_g(f.A_t, f.A_w) ## Scale factor included in fft_g
        
        ## Dispersion (II pass)
        prod(f.A_w, f.w_exp, f.A_w)
        
        ifft_g(f.A_w, f.A_t)  ## Scale factor included in fft_g
        
        
        return f.A_t, f.A_w

    ## Init the f function
    f.delta_z = 0 # The rest will be evaluated lazily

    ## Convert to GPU arrays
    f.A_t = gpuarray.to_gpu(ones(n_points, complex64))
    f.A_w = gpuarray.to_gpu(ones(n_points, complex64))
    A_t   = gpuarray.to_gpu(A_t.astype(complex64))
    A_w   = gpuarray.to_gpu(A_w.astype(complex64))
    
    
    
    apod_array  = gpuarray.to_gpu(apod_array.astype(complex64))
    if hasattr(w_op,'__len__'):
        w_op = gpuarray.to_gpu(w_op.astype(complex64))
    else:  ## Use array even if it's a single values, othewise error when updating dz
        w_op = gpuarray.to_gpu(w_op*ones(n_points).astype(complex64))
    if hasattr(t_op,'__len__'):
        t_op = gpuarray.to_gpu(t_op.astype(complex64))
    else:
        t_op = gpuarray.to_gpu(t_op*ones(n_points).astype(complex64))
    error = tollerance
    
    print "Ready for integration"
    
    ## Init loop variables
    sol_i = 0    
    sols  = [A0]
    iters = 0
    
    ## Integration loop
    while z0 <= zf:
        ## Cycle check
        if z0 >= z_array[sol_i]:
            #print "dz = %.2e error=%.2f z = %.2e"%(delta_z,error,z0)
            sols.append(A_t.get())
            sol_i +=1
            
        try:  ## Force to have steps smaller than the distance between 2 solutions
            while z0 + delta_z >= z_array[sol_i + 1]:
                delta_z /= 2.
        except:
            pass
    
        ## Dynamical correction
        while dynamic_predictor:
            A_coarse = f(gpuarray.to_gpu(A_t.get()),
                         gpuarray.to_gpu(A_w.get()),
                         dz=2*delta_z)[0].get()
            A_fine = f(*f(gpuarray.to_gpu(A_t.get()),
                          gpuarray.to_gpu(A_w.get()),
                         delta_z), dz=delta_z)[0].get()
            delta = A_fine-A_coarse
            error = sqrt( trapz(delta*delta.conj())/ \
                          trapz(A_fine*A_fine.conj()))
            #print "Error : ",error, " dz :", delta_z
            if error < 2 * tollerance:
                done_once = True
                break  ## Error is less then the tollerance, proceed
            delta_z = delta_z / 2.
        
        # update step        
        A_t, A_w = f(A_t, A_w, delta_z)
        z0 += delta_z
        iters += 1
        
        # Dynamic step (additional correction for faster convergence)
        if (dynamic_predictor or not (done_once or dynamic_predictor) ):
            if error > tollerance:
                delta_z = delta_z / 1.23
            if error < 0.5/tollerance:
                delta_z = delta_z * 1.23
                
        # Show the state of the loop every 200 loops (approx every few secs)
        if iters %200 == 0:
            print "Iter %8d (to end %8d) %4.1f %%"%(iters, 
                            (z_array[-1]-z0)/delta_z,
                            100.*iters/(iters+(z_array[-1]-z0)/delta_z))
    
    ## Integration is over
    print "Total iterations: ", iters
    ## Return array with solutions (and their ftt)
    return sols
예제 #47
0
    def __init__(self, cfg, scene, data_writer, receiver_files, output_fn):
        import pycuda.driver as cuda
        from pycuda.tools import make_default_context
        from pycuda.compiler import SourceModule
        from pycuda.driver import Function
        from pyfft.cuda import Plan

        cuda.init()
        context = make_default_context()
        stream = cuda.Stream()

        plan_set = {}  # will be filled as needed in spatderp3_gpu(~), prefill with 128, 256 and 512
        plan_set[str(128)] = Plan(128, dtype=np.float64, context=context, stream=stream, fast_math=False)
        plan_set[str(256)] = Plan(256, dtype=np.float64, context=context, stream=stream, fast_math=False)
        plan_set[str(512)] = Plan(512, dtype=np.float64, context=context, stream=stream, fast_math=False)

        g_bufl = (
            {}
        )  # m/d(r/i) -> windowed matrix/derfact real/imag buffers. m(1/2/3)->p(#) buffers. spatderp3 will expand them if needed
        g_bufl["mr"] = cuda.mem_alloc(8 * 128 * 128)
        g_bufl["mi"] = cuda.mem_alloc(8 * 128 * 128)
        g_bufl["m1"] = cuda.mem_alloc(8 * 128 * 128)
        g_bufl["m2"] = cuda.mem_alloc(8 * 128 * 128)
        g_bufl["m3"] = cuda.mem_alloc(8 * 128 * 128)
        g_bufl["m_size"] = 8 * 128 * 128
        g_bufl["dr"] = cuda.mem_alloc(8 * 128)  # dr also used by A (window matrix)
        g_bufl["di"] = cuda.mem_alloc(8 * 128)
        g_bufl["d_size"] = 8 * 128

        kernelcode = SourceModule(
            """ 
              #include <stdio.h>
              __global__ void derifact_multiplication(double *matr, double *mati, double *vecr, double *veci, int fftlen, int fftnum)
              {
                int index_x = blockIdx.x*blockDim.x + threadIdx.x; 
                int index_y = blockIdx.y*blockDim.y + threadIdx.y;

                int matindex = index_y*fftlen+index_x; //mat should be a contiguous array
                //printf("Block(x,y): (%d,%d). Thread(x,y): (%d,%d)\\n",blockIdx.x,blockIdx.y,threadIdx.x,threadIdx.y);
                // if N1%16>0, we're starting too many threads.
                // There is probably a better way to do this, but just eating the surplus should work.
                if (matindex < fftlen*fftnum) {
                    double matreal = matr[matindex];
                    double matimag = mati[matindex];
                    double vecreal = vecr[index_x];
                    double vecimag = veci[index_x];

                    matr[matindex] = matreal*vecreal - matimag*vecimag;
                    mati[matindex] = matreal*vecimag + matimag*vecreal;
                }
              }

              __global__ void pressure_window_multiplication(double *mr, double *mi, double *A, double *p1, double *p2, double *p3, int winlen, int Ns1, int Ns2, int Ns3, int fftlen, int fftnum, double R21, double R00, double R31, double R10) //passing a few by value seems to be more efficient than building an array first in pycuda
              {
                int index_x = blockIdx.x*blockDim.x + threadIdx.x; 
                int index_y = blockIdx.y*blockDim.y + threadIdx.y;

                int matindex = index_y*fftlen+index_x;

                double G = 1;
                if (index_x < winlen) {
                    G = A[index_x];
                } else if (index_x > winlen+Ns2-1 && index_x < winlen*2+Ns2) {
                    G = A[index_x-Ns2];
                }
                if (index_y < fftnum) { //eat the surplus
                    mi[matindex] = 0;
                    if (index_x < winlen) {
                        mr[matindex] = G*(R21*p1[Ns1*index_y+index_x-winlen+Ns1] + R00*p2[Ns2*index_y+winlen-1-index_x]);
                    } else if (index_x < winlen + Ns2) {
                        mr[matindex] = p2[Ns2*index_y+index_x-winlen];
                    } else if (index_x < winlen*2+Ns2) {
                        mr[matindex] = G*(R31*p3[Ns3*index_y+index_x-winlen-Ns2] + R10*p2[Ns2*index_y+2*Ns2+winlen-1-index_x]);
                    } else {
                        mr[matindex] = 0; //zero padding
                    }
                    //if(mr[matindex]==0 && matindex < 50) printf("zero at:%d\\n",matindex%fftlen);
                }
              }

              __global__ void velocity_window_multiplication(double *mr, double *mi, double *A, double *p1, double *p2, double *p3, int winlen, int Ns1, int Ns2, int Ns3, int fftlen, int fftnum, double R21, double R00, double R31, double R10) //passing a few by value seems to be more efficient than building an array first in pycuda
              {
                int index_x = blockIdx.x*blockDim.x + threadIdx.x; 
                int index_y = blockIdx.y*blockDim.y + threadIdx.y;

                int matindex = index_y*fftlen+index_x;

                double G = 1;
                if (index_x < winlen) {
                    G = A[index_x];
                } else if (index_x > winlen+Ns2-1 && index_x < winlen*2+Ns2) {
                    G = A[index_x-Ns2];
                }
                if (index_y < fftnum) { //eat the surplus
                    mi[matindex] = 0;
                    if (index_x < winlen) {
                        mr[matindex] = G*(R21*p1[Ns1*index_y+index_x-winlen+Ns1-1] + R00*p2[Ns2*index_y+winlen-index_x]);
                    } else if (index_x < winlen + Ns2) {
                        mr[matindex] = p2[Ns2*index_y+index_x-winlen];
                    } else if (index_x < winlen*2+Ns2) {
                        mr[matindex] = G*(R31*p3[Ns3*index_y+index_x-winlen-Ns2+1] + R10*p2[Ns2*index_y+2*Ns2+winlen-2-index_x]);
                    } else {
                        mr[matindex] = 0; //zero padding
                    }
                    //if(mr[matindex]==0 && matindex < 50) printf("zero at:%d\\n",matindex%fftlen);
                }
              }
              """
        )
        mulfunc = {}
        mulfunc["pres_window"] = kernelcode.get_function("pressure_window_multiplication")
        mulfunc["velo_window"] = kernelcode.get_function("velocity_window_multiplication")
        mulfunc["derifact"] = kernelcode.get_function("derifact_multiplication")

        # Loop over time steps
        for frame in range(int(cfg.TRK)):
            output_fn({"status": "running", "message": "Calculation frame:%d" % (frame + 1), "frame": frame + 1})

            # Keep a reference to current matrix contents
            for domain in scene.domains:
                domain.push_values()

            # Loop over subframes
            for sub_frame in range(6):
                # Loop over calculation directions and measures
                for domain in scene.domains:
                    calc_domain_gpu(domain, context, stream, plan_set, g_bufl, mulfunc)
                # Update acoustic values
                for domain in scene.domains:
                    if not domain.is_rigid():
                        update_domain(domain, sub_frame)

                # Sum the pressure components
                for domain in scene.domains:
                    domain.field_dict["p0"] = domain.field_dict["px0"] + domain.field_dict["pz0"]

            # Apply pml matrices to boundary domains
            scene.apply_pml_matrices()

            for rf, r in zip(receiver_files, scene.receivers):
                rf.write(struct.pack("f", r.calc()))
                rf.flush()

            if frame % cfg.save_nth_frame == 0:
                data_writer.write_to_file(frame)

        context.pop()
예제 #48
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
     integrate.init()
예제 #49
0
def main():
    ## Default input parameter specification
    r=1.0
    nz=100
    G=1.8962

    print("Starting\n")
    start_time = time.time()

    print("Creating Initial Profile\n")
    ##simulation parameter
    n_points = (1024,1024)
    Xmax= (5.0,5.0) #  grid and window
    dx = [2.*Xmax[i]/n_points[i] for i in [0,1]]

    dz=0.003
    beta = 500

    #print "Enter step size [Ldf]\n" 
	#scanf("%lf",&dz 
    #print "Enter number of steps \n" 
	#scanf("%lf",&nz 
	#nz=(int)nz;
	#print "Enter number of critical powers\n" 
	#scanf("%lf",&beta 
    gamma=G*beta

    x=linspace(-Xmax[0],Xmax[0],n_points[0])
    y=linspace(-Xmax[1],Xmax[1],n_points[1])
    
    kx=fftfreq(n_points[0],dx[0])
    ky=fftfreq(n_points[1],dx[1])
    
    X,Y = meshgrid(x,y)
    Kx, Ky = meshgrid(kx,ky)
    
    keepMax=zeros(N_Z)

    II_out= zeros(n_points)
    U_m = zeros(n_points, dtype = complex64)
    IM_out= zeros(n_points)
    IF_out= zeros(n_points)
    ufft= zeros(n_points, dtype = complex64)
    ufft_pc= zeros(n_points, dtype = complex64)

	##for (j=0;j<nx;j++) {
	##	x[j]=(double)(-nx/2+j+1)*dx;
	##	kx[j]=(j < nx/2 ) ?
    ##             (pi*(double)(j))*(1./dx/((double)(nx))):
    ##             (pi*(double)(j-nx))*(1./dx/((double)(nx))    ##fx_s=1/dx, dfx= fx_s/N-> d omega= 2pi dfx		
	## ?????	kx[j]=kx[j]*kx[j];
	
    u = exp( -(X**2 + Y**2)/r ) + 0.j
    u = u.astype(complex64)
    
    u_m= zeros(n_points)
    u_f= zeros(n_points)
    
    II_out = (u.real**2 + u.imag**2)  ## ???
	
    steps=2.*nz

    print "Step size %g [Ldf]\n"%dz
    print "Number of critical powers %g\n"%beta
    print "Number of steps %d\n"%N_Z


    ## cuFFT planning and preparation
    # cuda.init()
    dev = pycuda.autoinit.device
    context = make_default_context()
    
    nonlinearMod = SourceModule("""
 #include <pycuda-complex.hpp>

__global__ void nonlinear(pycuda::complex<float> *u_mat, float beta, float dz, pycuda::complex<float> *keepMax, int step)
{
		const int y = blockDim.y * blockIdx.y + threadIdx.y;
		const int x = blockDim.x * blockIdx.x + threadIdx.x;
		float I;
        pycuda::complex<float> I_UNIT(0.,1.);
        int i = x* %(n)d + y;
        
		I=pycuda::abs(u_mat[i]);

		u_mat[i]= u_mat[i]* pycuda::exp(I_UNIT*I*beta*dz);
		
		if ((x==i/2) && (y==i/2))
			keepMax[step]=pycuda::exp(I_UNIT*I*beta*dz);
	
}
    
__global__ void prod(pycuda::complex<float> *X,
                     pycuda::complex<float> *Y, pycuda::complex<float> *Z)
{
		const int y = blockDim.y * blockIdx.y + threadIdx.y;
		const int x = blockDim.x * blockIdx.x + threadIdx.x;
        int i = x*%(n)d + y;
		Z[i]=X[i] * Y[i];
	
    }"""%{'n' : n_points[0]})

    
    print "Device %d: \"%s\" with Compute %d.%d capability\n"%(dev.pci_bus_id,
                                                               dev.name(),
                                                               dev.compute_capability()[0],
                                                               dev.compute_capability()[1])
    print "Creating FFT Plans\n"
	
    
    plan = Plan(n_points, wait_for_finish = True, scale = dx[0]*dx[1])
    block = (16,16,1)
    grid = (n_points[0]/block[0],
            n_points[1]/block[1])   ## Threads per block
    fft_g  = lambda x, y: plan.execute(x, y)
    ifft_g = lambda x, y: plan.execute(x, y, inverse = True)
    
    g_mult = nonlinearMod.get_function('prod')
    runNonLinear = nonlinearMod.get_function("nonlinear")
    
    print "Allocating memory on device\n"
    u_gpu = gpuarray.to_gpu(u.astype(complex64))
    U_gpu = gpuarray.to_gpu(zeros(n_points, complex64))


    print "Allocating kx, ky & keepMax\n" 
    cukx = gpuarray.to_gpu(kx)
    cuky = gpuarray.to_gpu(ky)
    cukeepMax = gpuarray.to_gpu(ones(nz, complex64))
 
	## preparing the data to transfer to the device 
    
    IM_out = u.real
    #fileout("A",0.,h_in,nx, ny) 
    
    print "Starting %i FFT pairs\n"%steps
    start = time.time()

    op_diff = exp(5e2j*(Kx**2+Ky**2) *dz/2.)
    op_diff = gpuarray.to_gpu(op_diff.astype(complex64))
    
    zero_j = array([0],dtype = complex64)
    one_j = array([1],dtype = complex64)
    idxdy = array([1./(dx[0]*dx[1])], dtype = complex64)
    dxdy = array([(dx[0]*dx[1])], dtype = complex64)
    
    g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
    context.synchronize()
    #print abs(U_gpu.get())
    #pl.imshow(abs(U_gpu.get()))
    #pl.figure()
    
    for l in xrange(nz):
		## FFT into the spatial frequency domain
        fft_g(u_gpu, U_gpu)
        g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
        context.synchronize()
        
		## inverse FFT into space domain
        ifft_g(U_gpu, u_gpu)
        
    	
		## Nonlinear step in space domain
        runNonLinear(u_gpu, float32(gamma), float32(dz), cukeepMax, int32(l), block = block, grid = grid)
        context.synchronize()
		## cast to double
        fft_g(u_gpu, U_gpu)
        
        g_mult(U_gpu, op_diff, U_gpu, block = block, grid = grid)
        context.synchronize()
		## inverse FFT into space domain
        ifft_g(U_gpu, u_gpu)
예제 #50
0
def suite():
    context = make_default_context()
    device = context.get_device()
    context.pop()

    s = TestSuite()
    s.addTest(test_cublas('test_cublasIsamax'))
    s.addTest(test_cublas('test_cublasIcamax'))
    s.addTest(test_cublas('test_cublasIsamin'))
    s.addTest(test_cublas('test_cublasIcamin'))
    s.addTest(test_cublas('test_cublasSasum'))
    s.addTest(test_cublas('test_cublasScasum'))
    s.addTest(test_cublas('test_cublasSaxpy'))
    s.addTest(test_cublas('test_cublasCaxpy'))
    s.addTest(test_cublas('test_cublasScopy'))
    s.addTest(test_cublas('test_cublasCcopy'))
    s.addTest(test_cublas('test_cublasSdot'))
    s.addTest(test_cublas('test_cublasCdotu'))
    s.addTest(test_cublas('test_cublasCdotc'))
    s.addTest(test_cublas('test_cublasSrnm2'))
    s.addTest(test_cublas('test_cublasScrnm2'))
    s.addTest(test_cublas('test_cublasSscal'))
    s.addTest(test_cublas('test_cublasCscal'))
    s.addTest(test_cublas('test_cublasSrot'))
    s.addTest(test_cublas('test_cublasSswap'))
    s.addTest(test_cublas('test_cublasCswap'))
    s.addTest(test_cublas('test_cublasSgemv'))
    s.addTest(test_cublas('test_cublasCgemv'))
    s.addTest(test_cublas('test_cublasSgeam'))
    s.addTest(test_cublas('test_cublasCgeam'))
    s.addTest(test_cublas('test_cublasSgemmBatched'))
    s.addTest(test_cublas('test_cublasCgemmBatched'))
    s.addTest(test_cublas('test_cublasStrsmBatched'))
    s.addTest(test_cublas('test_cublasSgetrfBatched'))
    if misc.get_compute_capability(device) >= 1.3:
        s.addTest(test_cublas('test_cublasIdamax'))
        s.addTest(test_cublas('test_cublasIzamax'))
        s.addTest(test_cublas('test_cublasIdamin'))
        s.addTest(test_cublas('test_cublasIzamin'))
        s.addTest(test_cublas('test_cublasDasum'))
        s.addTest(test_cublas('test_cublasDzasum'))
        s.addTest(test_cublas('test_cublasDaxpy'))
        s.addTest(test_cublas('test_cublasZaxpy'))
        s.addTest(test_cublas('test_cublasDcopy'))
        s.addTest(test_cublas('test_cublasZcopy'))
        s.addTest(test_cublas('test_cublasDdot'))
        s.addTest(test_cublas('test_cublasZdotu'))
        s.addTest(test_cublas('test_cublasZdotc'))
        s.addTest(test_cublas('test_cublasDrnm2'))
        s.addTest(test_cublas('test_cublasDzrnm2'))
        s.addTest(test_cublas('test_cublasDscal'))
        s.addTest(test_cublas('test_cublasZscal'))
        s.addTest(test_cublas('test_cublasZdscal'))
        s.addTest(test_cublas('test_cublasDswap'))
        s.addTest(test_cublas('test_cublasZswap'))
        s.addTest(test_cublas('test_cublasDgemv'))
        s.addTest(test_cublas('test_cublasZgemv'))
        s.addTest(test_cublas('test_cublasDgeam'))
        s.addTest(test_cublas('test_cublasZgeam'))
        s.addTest(test_cublas('test_cublasDgemmBatched'))
        s.addTest(test_cublas('test_cublasZgemmBatched'))
        s.addTest(test_cublas('test_cublasDtrsmBatched'))
        s.addTest(test_cublas('test_cublasDgetrfBatched'))
    return s
예제 #51
0
from pycuda.tools import PageLockedMemoryPool
import numpy as np
import time
import ctypes
import pdb
from queue import Queue
from threading import Thread
from pycuda.tools import make_default_context
import matplotlib.pyplot as plt

import threading
# Initialize CUDA
cuda.init()

global ctx
ctx = make_default_context()  # will initialize the first device it finds
dev = ctx.get_device()


def _finish_up():
    global ctx
    ctx.pop()
    ctx = None

    from pycuda.tools import clear_context_caches
    clear_context_caches()


import atexit
atexit.register(_finish_up)
예제 #52
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
     cls.cublas_handle = cublas.cublasCreate()
예제 #53
0
from __future__ import absolute_import
import pycuda.driver as cuda

# Initialize CUDA
cuda.init()

from pycuda.tools import make_default_context
global context
context = make_default_context()
device = context.get_device()

def _finish_up():
    global context
    context.pop()
    context = None

    from pycuda.tools import clear_context_caches
    clear_context_caches()

import atexit
atexit.register(_finish_up)
예제 #54
0
from __future__ import absolute_import
import pycuda.driver as cuda
import pycuda.gl as cudagl

cuda.init()
assert cuda.Device.count() >= 1

from pycuda.tools import make_default_context
context = make_default_context(lambda dev: cudagl.make_context(dev))
device = context.get_device()

import atexit
atexit.register(context.pop)