Example #1
def pycuda_deallocation(self):
    # pycuda dealloc
    global context
    context.pop()
    context = None
    from pycuda.tools import clear_context_caches
    clear_context_caches()
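The global context that this method pops must be created elsewhere; a minimal sketch of the setup side, pycuda.autoinit-style (the helper name is hypothetical):

import pycuda.driver as cuda

context = None

def pycuda_allocation(device=0):  # hypothetical counterpart to the method above
    global context
    cuda.init()
    context = cuda.Device(device).make_context()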
Example #2
def _finish_up():
    global context
    context.pop()
    context = None

    from pycuda.tools import clear_context_caches
    clear_context_caches()
Example #3
def context_cleanup():
    #print("CUDA Context cleanup")
    global CONTEXT
    CONTEXT.synchronize()
    CONTEXT.pop()
    CONTEXT = None

    from pycuda.tools import clear_context_caches
    clear_context_caches()
Example #4
def _finish_up():
    global is_initialized
    if is_initialized:
        global context
        context.pop()
        context = None

        from pycuda.tools import clear_context_caches
        clear_context_caches()
        is_initialized = False
Example #6
File: util.py Project: whitews/gpustats
def clean_all_contexts():
    # assumes a module-level import: pycuda.driver as drv
    ctx = True
    while ctx is not None:
        ctx = drv.Context.get_current()
        if ctx is not None:
            ctx.detach()

    from pycuda.tools import clear_context_caches
    clear_context_caches()
Example #8
def freeMem(self):
    self.context.push()
    self.GPU_Lattice.free()
    self.GPU_params.free()
    self.QField.free()
    self.QFieldCopy.free()
    self.context.pop()
    tools.clear_context_caches()
    gc.collect()
    self.context.detach()
    print("Memory freed for device", self.deviceNum)
Example #9
def clean_cuda(context):
    # Before the CUDA context is destroyed, all CUDA-dependent teardown must
    # run. This calls every function registered via _register_clean_cuda(),
    # in reverse order: the last one registered is the first one cleaned.
    _cuda_cleanup_list.reverse()
    for func in _cuda_cleanup_list:
        func()

    context.pop()
    from pycuda.tools import clear_context_caches
    clear_context_caches()
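The registration side is not shown in this example; a minimal sketch of the module-level list and the _register_clean_cuda() helper it implies:

# Hypothetical companion code, inferred from the example above.
_cuda_cleanup_list = []

def _register_clean_cuda(func):
    # register a zero-argument callable to run before the context is destroyed
    _cuda_cleanup_list.append(func)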
Example #10
def _clean_up():
    global ctx
    if ctx is not None:
        try:
            ctx.pop()
            ctx.detach()
        except Exception:
            # the context may already be invalid (e.g. after a fork)
            pass
    from pycuda.tools import clear_context_caches
    clear_context_caches()
Example #12
        def _finish_up(ctx):
            print('\n\nWrapping up thread %d...\n\n' % device_num)
            sys.stdout.flush()
            ctx.pop()

            from pycuda.tools import clear_context_caches
            clear_context_caches()

            # put something in output queue to satisfy
            # parent's map
            self.q_out.put([0, device_num])
            sys.exit()
Example #13
File: util.py Project: xiangze/gpustats
def threadSafeInit(device=0):
    """
    If gpustats (or any other pycuda work) is used inside a 
    multiprocessing.Process, this function must be used inside the
    thread to clean up invalid contexts and create a new one on the 
    given device. Assumes one GPU per thread.
    """

    import atexit
    drv.init()  # just in case

    ## clean up all contexts. most will be invalid from
    ## multiprocessing fork
    import os
    import sys
    clean = False
    while not clean:
        _old_ctx = drv.Context.get_current()
        if _old_ctx is None:
            clean = True
        else:
            ## detach: will give warnings to stderr if invalid
            _old_cerr = os.dup(sys.stderr.fileno())
            _nl = os.open(os.devnull, os.O_RDWR)
            os.dup2(_nl, sys.stderr.fileno())
            _old_ctx.detach()
            sys.stderr = os.fdopen(_old_cerr, "wb")
            os.close(_nl)
    from pycuda.tools import clear_context_caches
    clear_context_caches()

    ## init a new device
    dev = drv.Device(device)
    ctx = dev.make_context()

    ## pycuda.autoinit's exit handler is stale now .. delete it
    ## (atexit._exithandlers and func_name are CPython 2 internals;
    ## on Python 3, atexit.unregister would be used instead)
    exit_funcs = atexit._exithandlers
    for fn in exit_funcs:
        if hasattr(fn[0], 'func_name'):
            if fn[0].func_name == '_finish_up':
                exit_funcs.remove(fn)
            if fn[0].func_name == 'clean_all_contexts':  # avoid duplicates
                exit_funcs.remove(fn)

    ## make sure we clean again on exit
    atexit.register(clean_all_contexts)
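A minimal usage sketch, assuming one worker process per GPU as the docstring describes (the worker function is hypothetical):

import multiprocessing

def worker(device_id):  # hypothetical worker body
    threadSafeInit(device=device_id)
    # ... pycuda work on this device ...

if __name__ == '__main__':
    procs = [multiprocessing.Process(target=worker, args=(d,)) for d in range(2)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()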
Example #15
    def f(*args, **kwargs):
        import pycuda.driver
        # appears to be idempotent, i.e. no harm in calling it more than once
        pycuda.driver.init()

        ctx = make_default_context()
        try:
            assert isinstance(ctx.get_device().name(), str)
            assert isinstance(ctx.get_device().compute_capability(), tuple)
            assert isinstance(ctx.get_device().get_attributes(), dict)
            inner_f(*args, **kwargs)
        finally:
            ctx.pop()

            from pycuda.tools import clear_context_caches
            clear_context_caches()

            from gc import collect
            collect()
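This wrapper closes over inner_f, so it is evidently the body of a test decorator; a minimal sketch of the enclosing decorator, assuming only what the wrapper itself uses (the decorator name is hypothetical):

from functools import wraps

def with_cuda_context(inner_f):  # hypothetical decorator around the wrapper above
    @wraps(inner_f)
    def f(*args, **kwargs):
        import pycuda.driver
        pycuda.driver.init()
        from pycuda.tools import make_default_context
        ctx = make_default_context()
        try:
            inner_f(*args, **kwargs)
        finally:
            ctx.pop()
            from pycuda.tools import clear_context_caches
            clear_context_caches()
            from gc import collect
            collect()
    return f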
Example #17
def cleanup():
    # assumes module-level ctx and: from pycuda import tools
    ctx.pop()
    tools.clear_context_caches()
Example #18
    def __exit__(self, exc_type, exc_value, traceback):
        self.context.pop()
        self.context = None

        clear_context_caches()
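Since this is the __exit__ half of a context manager, typical use scopes GPU work to a with-block; a minimal sketch of a hypothetical owning class:

import pycuda.driver as cuda
from pycuda.tools import clear_context_caches

class CudaSession:  # hypothetical name
    def __enter__(self):
        cuda.init()
        self.context = cuda.Device(0).make_context()
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.context.pop()
        self.context = None
        clear_context_caches()

# with CudaSession() as session:
#     ...  # GPU work happens while the context is active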
Example #19
def mf_rmse(U, V, users, movies, ratings, split, latent=30, debug=1):

    us = int(math.ceil(float(np.max(users)) / split))  # np.float was removed in NumPy 1.24
    vs = int(math.ceil(float(np.max(movies)) / split))

    u1, v1 = 0, 0
    error = 0.0
    totnum = 0
    totmse = 0.0
    t4 = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i in range(us):

        u1 = i * split
        if np.max(users) < u1:
            u1 = int(np.max(users))

        u2 = ((i + 1) * split - 1)
        if np.max(users) < u2:
            u2 = int(np.max(users))

        for j in range(vs):
            v1 = j * split
            if np.max(movies) < v1:
                v1 = int(np.max(movies))

            v2 = (j + 1) * split - 1
            if np.max(movies) < v2:
                v2 = int(np.max(movies))

            if debug > 1:
                print("Processing split : ", i, j, u1, u2, v1, v2)

            uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
            if debug > 1:
                print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)

            t6 = time.perf_counter()
            P, Q = U[u1:u2 + 1, 0:latent], V[0:latent, v1:v2 + 1]
            P = P.reshape(P.shape[0] * P.shape[1], 1).astype(np.float32)
            Q = Q.reshape(Q.shape[0] * Q.shape[1], 1).astype(np.float32)

            tools.clear_context_caches()
            a_gpu = gpuarray.to_gpu(P)
            b_gpu = gpuarray.to_gpu(Q)

            t7 = time.perf_counter()
            u_gpu = gpuarray.to_gpu(uu)
            v_gpu = gpuarray.to_gpu(mm)
            r_gpu = gpuarray.to_gpu(rr)

            ex_gpu = gpuarray.zeros((3072, 1), np.float32)
            ey_gpu = gpuarray.zeros((3072, 1), np.int32)

            if len(uu) > 0:
                rmse(a_gpu,
                     b_gpu,
                     u_gpu,
                     v_gpu,
                     r_gpu,
                     ex_gpu,
                     ey_gpu,
                     np.int32(u2 - u1 + 1),
                     np.int32(latent),
                     np.int32(v2 - v1 + 1),
                     np.int32(u1),
                     np.int32(u2),
                     np.int32(v1),
                     np.int32(v2),
                     np.int32(len(uu)),
                     np.int32(len(mm)),
                     block=(16, 16, 1),
                     grid=(3, 4))
                ex = ex_gpu.get()
                ey = ey_gpu.get()
                num = np.sum(ey)
                mse = np.sum(np.dot(ex.T, ey))
                temp = float(totnum + num)

                error = error * (totnum / temp) + (mse / temp)
                totnum += num
                totmse += mse
                if debug > 1:
                    print(" mse , error ", totmse, mse, mse / num, error, num,
                          len(uu))

            t8 = time.perf_counter()

    return np.sqrt(error)
Example #20
@classmethod
def tearDownClass(cls):
    cls.ctx.pop()
    clear_context_caches()
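The matching setup is not shown in these snippets; a minimal sketch of the setUpClass such a test case would pair with (the class name is hypothetical):

import unittest
import pycuda.driver as drv
from pycuda.tools import clear_context_caches

class GPUTestCase(unittest.TestCase):  # hypothetical test case
    @classmethod
    def setUpClass(cls):
        # create the context that tearDownClass pops
        drv.init()
        cls.ctx = drv.Device(0).make_context()

    @classmethod
    def tearDownClass(cls):
        cls.ctx.pop()
        clear_context_caches()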
Example #21
@classmethod
def tearDownClass(cls):
    cublas.cublasDestroy(cls.cublas_handle)
    cls.ctx.pop()
    clear_context_caches()
Example #22
def finish_up(self):
    self.context.pop()
    self.context = None
    from pycuda.tools import clear_context_caches
    clear_context_caches()
Example #23
        def _finish_up(ctx):
            print('wrapping up')
            ctx.pop()

            from pycuda.tools import clear_context_caches
            clear_context_caches()
Example #24
def _finish_up(context):
    if context is not None:
        context.pop()
        context = None
    clear_context_caches()
Example #25
@classmethod
def tearDownClass(cls):
    misc.shutdown()
    cls.ctx.pop()
    clear_context_caches()
Example #26
def clear_cuda_context():
    from pycuda.tools import clear_context_caches
    CONTEXT.pop()
    clear_context_caches()
Example #27
def recompile_all(function_name, kernel_src, device_ids=None):
    global KERNEL_cubins
    KERNEL_cubins = {}
    tools.clear_context_caches()
    compile_all(function_name, kernel_src, device_ids)
Example #30
import numpy as np  # type: ignore

# Attempt to setup CUDA. This may fail if the pycuda package is not
# installed or if it is installed but there are no devices available.
try:
    import pycuda.driver as cuda  # type: ignore
    from pycuda.compiler import SourceModule
    from pycuda.tools import make_default_context, clear_context_caches
    # Ask CUDA for the default context (so that we know that one exists)
    # then immediately throw it away in case the user doesn't want it.
    # Note: cribbed from pycuda.autoinit
    cuda.init()
    context = make_default_context()
    context.pop()
    clear_context_caches()
    del context
    HAVE_CUDA = True
    CUDA_ERROR = ""
except Exception as exc:
    HAVE_CUDA = False
    CUDA_ERROR = str(exc)

from . import generate
from .kernel import KernelModel, Kernel

# pylint: disable=unused-import
try:
    from typing import Tuple, Callable, Any
    from .modelinfo import ModelInfo
    from .details import CallDetails
except ImportError:
    pass
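Downstream code can then branch on the probe result; a minimal sketch, assuming only the HAVE_CUDA and CUDA_ERROR flags defined above (the helper name is hypothetical):

def require_cuda():  # hypothetical helper
    if not HAVE_CUDA:
        raise RuntimeError("CUDA unavailable: %s" % CUDA_ERROR)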
Example #31
@classmethod
def tearDownClass(cls):
    integrate.shutdown()
    cls.ctx.pop()
    clear_context_caches()
Example #32
@classmethod
def tearDownClass(cls):
    magma.magma_finalize()
    cls.ctx.pop()
    clear_context_caches()
Example #33
    def __finalize(self):
        self.context.pop()
        self.context = None

        clear_context_caches()
Example #36
File: cudaCGH.py Project: mal858/pyfab
def stop(self):
    super(cudaCGH, self).stop()
    self.context.pop()
    self.context = None
    tools.clear_context_caches()
Example #38
def factorize(users,
              movies,
              ratings,
              test_users,
              test_movies,
              test_ratings,
              latent=30,
              steps=10,
              gpu_steps=1,
              alpha=0.0002,
              beta=0.02,
              delta=0.01,
              rmse_repeat_count=5,
              debug=1):

    U, V = initUV(int(np.max(users) + 1), latent, int(np.max(movies) + 1))
    U, V = np.array(U).astype(np.float32), np.array(V).astype(
        np.float32).transpose()

    print("Shape of P,Q : ", U.shape, V.shape)

    start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
    y1, y2 = [], []

    error, count = rmse(test_users, test_movies, test_ratings, U, V.T), 0
    print("Initial test error :", round(error, 4))

    for k in range(steps):

        if debug > 1:
            print("Step : ", k)

        t6 = time.perf_counter()

        uu, mm, rr = np.array(users).astype(np.int32), np.array(movies).astype(
            np.int32), np.array(ratings).astype(np.int32)

        t7 = time.perf_counter()
        tools.clear_context_caches()
        u_gpu = gpuarray.to_gpu(uu)
        v_gpu = gpuarray.to_gpu(mm)
        r_gpu = gpuarray.to_gpu(rr)

        a_gpu = gpuarray.to_gpu(U)
        b_gpu = gpuarray.to_gpu(V)

        if debug > 1:
            print("Length of uu,mm ", len(uu), len(mm), np.max(users),
                  np.max(movies), U.shape, V.shape)

        if (len(uu) != 0 and len(mm) != 0):
            matrixfact(
                u_gpu,
                v_gpu,
                r_gpu,
                a_gpu,
                b_gpu,
                np.int32(np.max(users)),
                np.int32(latent),
                np.int32(np.max(movies)),
                np.int32(len(uu)),
                np.int32(len(mm)),
                np.int32(gpu_steps),
                np.float32(alpha),
                np.float32(beta),
                np.float32(delta),
                block=(16, 16, 1),
                grid=(
                    3, 4
                )  # always keep blockIdx.z at 1 - the kernel expects no threads in the z axis
            )
            P = a_gpu.get()
            Q = b_gpu.get()
            U, V = np.array(P), np.array(Q)
            t8 = time.perf_counter()

            if debug > 1:
                t9 = time.perf_counter()
                if debug > 2:
                    np.savetxt('U' + str(k), U, fmt='%.4f')
                    np.savetxt('V' + str(k), V, fmt='%.4f')
                print("Timer :", round(t7 - t6, 4), round(t8 - t7, 4),
                      round(t9 - t8, 4))

        t5 = time.perf_counter()
        if debug > 1:
            print("Step time taken : ", round(t5 - t7, 2))
        y1.append(t5 - start_time)
        test_rmse = rmse(test_users, test_movies, test_ratings, U, V.T)
        print("Step test error :", round(test_rmse, 4))

        train_rmse = rmse(users, movies, ratings, U, V.T)
        y2.append([train_rmse, test_rmse])

        step_error = round(test_rmse, 4)

        if step_error < delta:
            break
        elif step_error == error:
            count = count + 1
        elif step_error > error:
            break
        elif rmse_repeat_count == count:
            break
        else:
            error = step_error

    if debug > 1:
        np.savetxt('gpmf-' + str(start_time) + '-y1.txt', y1, fmt='%.4f')
        np.savetxt('gpmf-' + str(start_time) + '-y2.txt', y2, fmt='%.4f')