def pycuda_deallocation(self):
    """Tear down the module-level PyCUDA context.

    Pops the global ``context``, rebinds that global to ``None`` and then
    clears PyCUDA's context caches.  ``self`` is not used by the body.
    """
    global context
    context.pop()
    context = None

    # Imported at call time, as before, so pycuda.tools is only loaded
    # when the teardown actually runs.
    import pycuda.tools
    pycuda.tools.clear_context_caches()
def _finish_up():
    """Pop the global ``context``, forget it, and clear PyCUDA's caches."""
    global context
    context.pop()
    context = None

    # Imported at call time so pycuda.tools is only loaded on teardown.
    import pycuda.tools
    pycuda.tools.clear_context_caches()
def context_cleanup():
    """Synchronize and release the module-level ``CONTEXT``.

    Calls ``synchronize()`` and ``pop()`` on the global ``CONTEXT``, rebinds
    the global to ``None`` and finally clears PyCUDA's context caches.
    """
    global CONTEXT
    CONTEXT.synchronize()
    CONTEXT.pop()
    CONTEXT = None

    import pycuda.tools
    pycuda.tools.clear_context_caches()
def _finish_up():
    """Release the global context once; later calls are no-ops.

    Does nothing unless the module flag ``is_initialized`` is truthy.
    Otherwise pops the global ``context``, rebinds it to ``None``, clears
    PyCUDA's context caches and resets ``is_initialized`` to ``False``.
    """
    global context, is_initialized

    if not is_initialized:
        return

    context.pop()
    context = None

    import pycuda.tools
    pycuda.tools.clear_context_caches()

    is_initialized = False
def _finish_up():
    """Release the global context once; later calls are no-ops.

    Does nothing unless the module flag ``is_initialized`` is truthy.
    Otherwise pops the global ``context``, rebinds it to ``None``, clears
    PyCUDA's context caches and resets ``is_initialized`` to ``False``.
    """
    global context, is_initialized

    if not is_initialized:
        return

    context.pop()
    context = None

    import pycuda.tools
    pycuda.tools.clear_context_caches()

    is_initialized = False
def clean_all_contexts():
    """Detach every current context, then clear PyCUDA's context caches.

    Repeatedly asks ``drv.Context.get_current()`` for the current context
    and detaches it until no context is reported any more.
    """
    while True:
        current = drv.Context.get_current()
        if current is None:
            break
        current.detach()

    import pycuda.tools
    pycuda.tools.clear_context_caches()
def clean_all_contexts():
    """Detach every current context, then clear PyCUDA's context caches.

    Repeatedly asks ``drv.Context.get_current()`` for the current context
    and detaches it until no context is reported any more.
    """
    while True:
        current = drv.Context.get_current()
        if current is None:
            break
        current.detach()

    import pycuda.tools
    pycuda.tools.clear_context_caches()
def freeMem(self):
    """Free this object's GPU buffers and detach its CUDA context.

    Pushes ``self.context``, calls ``free()`` on ``GPU_Lattice``,
    ``GPU_params``, ``QField`` and ``QFieldCopy``, pops the context again,
    clears the context caches, runs a garbage collection, detaches the
    context and reports the device number on stdout.

    The pop now happens in a ``finally`` clause so that a failing ``free()``
    no longer leaves the context pushed; the exception still propagates.
    """
    self.context.push()
    try:
        self.GPU_Lattice.free()
        self.GPU_params.free()
        self.QField.free()
        self.QFieldCopy.free()
    finally:
        self.context.pop()

    tools.clear_context_caches()
    gc.collect()
    self.context.detach()

    # Single-argument print(): valid on Python 2 and 3, and emits exactly the
    # text the old two-item print statement produced.
    print(" ".join(("Memory Freed for device ", str(self.deviceNum))))
def clean_cuda(context):
    """Run the registered CUDA cleanups, then release ``context``.

    Everything that depends on CUDA has to be destroyed before the context
    goes away, so the callbacks registered with ``_register_clean_cuda()``
    are invoked in reverse registration order (last registered, first
    cleaned).  Afterwards ``context`` is popped and PyCUDA's context caches
    are cleared.

    Note that ``_cuda_cleanup_list`` is reversed in place.
    """
    _cuda_cleanup_list.reverse()
    for cleanup in _cuda_cleanup_list:
        cleanup()

    context.pop()

    import pycuda.tools
    pycuda.tools.clear_context_caches()
def _clean_up():
    """Best-effort release of the global ``ctx`` plus a cache clear.

    If the global ``ctx`` is not ``None`` it is popped and detached; errors
    raised while doing so are ignored on purpose.  PyCUDA's context caches
    are cleared afterwards.
    """
    global ctx
    if ctx is not None:
        try:
            ctx.pop()
            ctx.detach()
        except Exception:
            # Deliberately best-effort: a context that is already gone must
            # not abort the cleanup.  Unlike a bare ``except`` this still lets
            # KeyboardInterrupt and SystemExit propagate.
            pass

    from pycuda.tools import clear_context_caches
    clear_context_caches()
def clean_cuda(context):
    """Run the registered CUDA cleanups, then release ``context``.

    Everything that depends on CUDA has to be destroyed before the context
    goes away, so the callbacks registered with ``_register_clean_cuda()``
    are invoked in reverse registration order (last registered, first
    cleaned).  Afterwards ``context`` is popped and PyCUDA's context caches
    are cleared.

    Note that ``_cuda_cleanup_list`` is reversed in place.
    """
    _cuda_cleanup_list.reverse()
    for cleanup in _cuda_cleanup_list:
        cleanup()

    context.pop()

    import pycuda.tools
    pycuda.tools.clear_context_caches()
def _finish_up(ctx):
    """Release ``ctx``, notify the parent through ``self.q_out`` and exit.

    Relies on ``device_num`` and ``self`` from the enclosing scope.  Ends by
    calling ``sys.exit()``, so it does not return normally.
    """
    # Parenthesised single expression: prints the same text on Python 2 and 3
    # (the former print statement is a SyntaxError on Python 3).
    print('\n\nWrapping up thread %d...\n\n' % (device_num))
    sys.stdout.flush()

    ctx.pop()
    from pycuda.tools import clear_context_caches
    clear_context_caches()

    # put something in output queue to satisfy
    # parent's map
    self.q_out.put([0, device_num])
    sys.exit()
def threadSafeInit(device=0):
    """
    If gpustats (or any other pycuda work) is used inside a
    multiprocessing.Process, this function must be used inside the
    thread to clean up invalid contexts and create a new one on the
    given device. Assumes one GPU per thread.

    Steps: initialise the driver, detach every current context (with the
    stderr file descriptor silenced while detaching), clear PyCUDA's context
    caches, create a fresh context on ``device`` and make sure
    ``clean_all_contexts`` is the registered exit handler.
    """
    import atexit
    import os
    import sys

    drv.init()  # just in case

    # Clean up all contexts. Most will be invalid after a multiprocessing fork.
    while True:
        _old_ctx = drv.Context.get_current()
        if _old_ctx is None:
            break

        # detach() writes warnings to stderr for invalid contexts, so point
        # the stderr *file descriptor* at the null device for the call and
        # restore it afterwards.  (Previously only sys.stderr was rebound, to
        # a binary file object, and the descriptor stayed on the null device.)
        stderr_fd = sys.stderr.fileno()
        sys.stderr.flush()
        saved_fd = os.dup(stderr_fd)
        null_fd = os.open(os.devnull, os.O_RDWR)
        try:
            os.dup2(null_fd, stderr_fd)
            _old_ctx.detach()
        finally:
            os.dup2(saved_fd, stderr_fd)
            os.close(saved_fd)
            os.close(null_fd)

    from pycuda.tools import clear_context_caches
    clear_context_caches()

    # Init a new context on the requested device.
    dev = drv.Device(device)
    ctx = dev.make_context()

    # pycuda.autoinit's exit function is bad now, so delete it; also drop any
    # earlier clean_all_contexts registration to avoid duplicates.
    exit_funcs = getattr(atexit, "_exithandlers", None)
    if exit_funcs is not None:
        # Python 2: rebuild the handler list in place rather than removing
        # entries while iterating, which skipped the entry after each removal.
        stale = ("_finish_up", "clean_all_contexts")
        exit_funcs[:] = [
            fn for fn in exit_funcs
            if getattr(fn[0], "func_name", None) not in stale
        ]
    elif hasattr(atexit, "unregister"):
        # Python 3 has no _exithandlers list; at least avoid registering
        # clean_all_contexts twice.
        atexit.unregister(clean_all_contexts)

    # Make sure we clean again on exit.
    atexit.register(clean_all_contexts)
def threadSafeInit(device = 0):
    """
    If gpustats (or any other pycuda work) is used inside a
    multiprocessing.Process, this function must be used inside the
    thread to clean up invalid contexts and create a new one on the
    given device. Assumes one GPU per thread.

    Steps: initialise the driver, detach every current context (with the
    stderr file descriptor silenced while detaching), clear PyCUDA's context
    caches, create a fresh context on ``device`` and make sure
    ``clean_all_contexts`` is the registered exit handler.
    """
    import atexit
    import os
    import sys

    drv.init()  # just in case

    # Clean up all contexts. Most will be invalid after a multiprocessing fork.
    while True:
        _old_ctx = drv.Context.get_current()
        if _old_ctx is None:
            break

        # detach() writes warnings to stderr for invalid contexts, so point
        # the stderr *file descriptor* at the null device for the call and
        # restore it afterwards.  (Previously only sys.stderr was rebound, to
        # a binary file object, and the descriptor stayed on the null device.)
        stderr_fd = sys.stderr.fileno()
        sys.stderr.flush()
        saved_fd = os.dup(stderr_fd)
        null_fd = os.open(os.devnull, os.O_RDWR)
        try:
            os.dup2(null_fd, stderr_fd)
            _old_ctx.detach()
        finally:
            os.dup2(saved_fd, stderr_fd)
            os.close(saved_fd)
            os.close(null_fd)

    from pycuda.tools import clear_context_caches
    clear_context_caches()

    # Init a new context on the requested device.
    dev = drv.Device(device)
    ctx = dev.make_context()

    # pycuda.autoinit's exit function is bad now, so delete it; also drop any
    # earlier clean_all_contexts registration to avoid duplicates.
    exit_funcs = getattr(atexit, "_exithandlers", None)
    if exit_funcs is not None:
        # Python 2: rebuild the handler list in place rather than removing
        # entries while iterating, which skipped the entry after each removal.
        stale = ("_finish_up", "clean_all_contexts")
        exit_funcs[:] = [
            fn for fn in exit_funcs
            if getattr(fn[0], "func_name", None) not in stale
        ]
    elif hasattr(atexit, "unregister"):
        # Python 3 has no _exithandlers list; at least avoid registering
        # clean_all_contexts twice.
        atexit.unregister(clean_all_contexts)

    # Make sure we clean again on exit.
    atexit.register(clean_all_contexts)
def f(*args, **kwargs):
    """Call ``inner_f(*args, **kwargs)`` inside a fresh default context.

    Initialises the driver, creates a context with
    ``make_default_context()``, asserts that a few device queries return the
    expected Python types and then runs ``inner_f``.  Whatever happens, the
    context is popped, PyCUDA's context caches are cleared and a garbage
    collection is triggered.  The result of ``inner_f`` is not returned.
    """
    import pycuda.driver

    # init() appears to be idempotent, i.e. no harm in calling it more than once
    pycuda.driver.init()

    context = make_default_context()
    try:
        assert isinstance(context.get_device().name(), str)
        assert isinstance(context.get_device().compute_capability(), tuple)
        assert isinstance(context.get_device().get_attributes(), dict)
        inner_f(*args, **kwargs)
    finally:
        context.pop()

        import pycuda.tools
        pycuda.tools.clear_context_caches()

        import gc
        gc.collect()
def f(*args, **kwargs):
    """Call ``inner_f(*args, **kwargs)`` inside a fresh default context.

    Initialises the driver, creates a context with
    ``make_default_context()``, asserts that a few device queries return the
    expected Python types and then runs ``inner_f``.  Whatever happens, the
    context is popped, PyCUDA's context caches are cleared and a garbage
    collection is triggered.  The result of ``inner_f`` is not returned.
    """
    import pycuda.driver

    # init() appears to be idempotent, i.e. no harm in calling it more than once
    pycuda.driver.init()

    context = make_default_context()
    try:
        assert isinstance(context.get_device().name(), str)
        assert isinstance(context.get_device().compute_capability(), tuple)
        assert isinstance(context.get_device().get_attributes(), dict)
        inner_f(*args, **kwargs)
    finally:
        context.pop()

        import pycuda.tools
        pycuda.tools.clear_context_caches()

        import gc
        gc.collect()
def cleanup():
    """Pop ``ctx`` and clear the context caches through ``tools``."""
    active = ctx
    active.pop()
    tools.clear_context_caches()
def __exit__(self, exc_type, exc_value, traceback):
    """Leave the ``with`` block: release the context held by this object.

    Pops ``self.context``, resets the attribute to ``None`` and clears the
    context caches.  The exception arguments are ignored and nothing is
    returned, so an exception raised inside the block keeps propagating.
    """
    self.context.pop()
    self.context = None
    clear_context_caches()
def mf_rmse(U, V, users, movies, ratings, split, latent=30, debug=1):
    """Compute the RMSE of the factorisation ``U``/``V`` tile by tile on the GPU.

    The user and movie index ranges are cut into tiles of width ``split``.
    For each tile the matching ratings are obtained from ``fetch``, the
    corresponding slices ``U[u1:u2 + 1, :latent]`` and ``V[:latent, v1:v2 + 1]``
    are flattened to float32 columns and uploaded, and the ``rmse`` kernel is
    launched when the tile contains ratings.  The kernel outputs are folded
    into a running weighted mean, whose square root is returned.

    Args:
        U: factor matrix indexed as ``[user, latent]``.
        V: factor matrix indexed as ``[latent, movie]``.
        users, movies, ratings: rating triples, handed to ``fetch`` unchanged.
        split: tile width used for both the user and the movie axis.
        latent: number of latent columns/rows taken from ``U``/``V``.
        debug: values above 1 enable progress printing.

    Returns:
        ``np.sqrt`` of the accumulated mean error.
    """
    # The maxima do not change inside the loops, so compute them once.
    max_user = np.max(users)
    max_movie = np.max(movies)

    # float() replaces the np.float alias, which was removed in NumPy 1.24.
    us = int(math.ceil(float(max_user) / split))
    vs = int(math.ceil(float(max_movie) / split))

    error = 0.0
    totnum = 0
    totmse = 0.0

    for i in range(us):
        # Inclusive user range of this tile, clamped to the largest user id.
        u1 = i * split
        if max_user < u1:
            u1 = int(max_user)
        u2 = (i + 1) * split - 1
        if max_user < u2:
            u2 = int(max_user)

        for j in range(vs):
            # Inclusive movie range of this tile, clamped likewise.
            v1 = j * split
            if max_movie < v1:
                v1 = int(max_movie)
            v2 = (j + 1) * split - 1
            if max_movie < v2:
                v2 = int(max_movie)

            if debug > 1:
                print("Processing split : ", i, j, u1, u2, v1, v2)

            uu, mm, rr = fetch(u1, u2, v1, v2, users, movies, ratings)
            if debug > 1:
                print("Shapes of uu,mm,rr :", uu.shape, mm.shape, rr.shape)

            # Flatten the tile's factor slices into float32 column vectors.
            P, Q = U[u1:u2 + 1, 0:latent], V[0:latent, v1:v2 + 1]
            P = P.reshape(P.shape[0] * P.shape[1], 1).astype(np.float32)
            Q = Q.reshape(Q.shape[0] * Q.shape[1], 1).astype(np.float32)

            tools.clear_context_caches()
            a_gpu = gpuarray.to_gpu(P)
            b_gpu = gpuarray.to_gpu(Q)
            u_gpu = gpuarray.to_gpu(uu)
            v_gpu = gpuarray.to_gpu(mm)
            r_gpu = gpuarray.to_gpu(rr)

            # 3072 == 16 * 16 * 3 * 4, the number of threads launched below;
            # presumably one output slot per thread - confirm against the kernel.
            ex_gpu = gpuarray.zeros((3072, 1), np.float32)
            ey_gpu = gpuarray.zeros((3072, 1), np.int32)

            if len(uu) > 0:
                rmse(a_gpu, b_gpu, u_gpu, v_gpu, r_gpu, ex_gpu, ey_gpu,
                     np.int32(u2 - u1 + 1), np.int32(latent),
                     np.int32(v2 - v1 + 1), np.int32(u1), np.int32(u2),
                     np.int32(v1), np.int32(v2), np.int32(len(uu)),
                     np.int32(len(mm)), block=(16, 16, 1), grid=(3, 4))
                ex = ex_gpu.get()
                ey = ey_gpu.get()
                num = np.sum(ey)
                mse = np.sum(np.dot(ex.T, ey))

                # Running mean weighted by the number of contributions so far.
                temp = float(totnum + num)
                error = error * (totnum / temp) + (mse / temp)
                totnum += num
                totmse += mse

                if debug > 1:
                    print(" mse , error ", totmse, mse, mse / num, error, num,
                          len(uu))

    return np.sqrt(error)
def tearDownClass(cls):
    """Pop the class-level context ``cls.ctx`` and clear the context caches."""
    context = cls.ctx
    context.pop()
    clear_context_caches()
def tearDownClass(cls):
    """Destroy the class's cuBLAS handle, then release its context.

    ``cls.cublas_handle`` is passed to ``cublas.cublasDestroy`` before
    ``cls.ctx`` is popped; the context caches are cleared last.
    """
    handle = cls.cublas_handle
    cublas.cublasDestroy(handle)

    context = cls.ctx
    context.pop()
    clear_context_caches()
def finish_up(self):
    """Release the context held by this object and clear PyCUDA's caches.

    Pops ``self.context`` and resets the attribute to ``None``.  Because
    the attribute is ``None`` after the first call, a repeated call now skips
    the pop instead of raising ``AttributeError``; the caches are cleared on
    every call.
    """
    if self.context is not None:
        self.context.pop()
        self.context = None

    from pycuda.tools import clear_context_caches
    clear_context_caches()
def _finish_up(ctx):
    """Announce the shutdown, pop ``ctx`` and clear PyCUDA's context caches."""
    # print() with one argument emits the same text on Python 2 and 3; the
    # former print statement is a SyntaxError on Python 3.
    print('wrapping up')
    ctx.pop()

    from pycuda.tools import clear_context_caches
    clear_context_caches()
def _finish_up(context):
    """Pop ``context`` and clear the context caches; no-op for ``None``."""
    if context is None:
        return

    context.pop()
    # Drop the local reference before the caches are cleared.
    context = None
    clear_context_caches()
def tearDownClass(cls):
    """Call ``misc.shutdown()``, pop ``cls.ctx`` and clear the context caches."""
    misc.shutdown()

    context = cls.ctx
    context.pop()
    clear_context_caches()
def clear_cuda_context():
    """Pop the module-level ``CONTEXT`` and clear PyCUDA's context caches."""
    import pycuda.tools

    CONTEXT.pop()
    pycuda.tools.clear_context_caches()
def recompile_all(function_name, kernel_src, device_ids=None):
    """Discard every cached cubin and compile the kernel again.

    Rebinds the global ``KERNEL_cubins`` to a new empty dict, clears the
    context caches and then forwards all three arguments to
    ``compile_all``.  Nothing is returned.
    """
    global KERNEL_cubins
    KERNEL_cubins = dict()

    tools.clear_context_caches()
    compile_all(function_name, kernel_src, device_ids)
def tearDownClass(cls):
    """Destroy the class's cuBLAS handle, then release its context.

    ``cls.cublas_handle`` is passed to ``cublas.cublasDestroy`` before
    ``cls.ctx`` is popped; the context caches are cleared last.
    """
    handle = cls.cublas_handle
    cublas.cublasDestroy(handle)

    context = cls.ctx
    context.pop()
    clear_context_caches()
def cleanup():
    """Pop ``ctx`` and clear the context caches through ``tools``."""
    active = ctx
    active.pop()
    tools.clear_context_caches()
import numpy as np # type: ignore # Attempt to setup CUDA. This may fail if the pycuda package is not # installed or if it is installed but there are no devices available. try: import pycuda.driver as cuda # type: ignore from pycuda.compiler import SourceModule from pycuda.tools import make_default_context, clear_context_caches # Ask CUDA for the default context (so that we know that one exists) # then immediately throw it away in case the user doesn't want it. # Note: cribbed from pycuda.autoinit cuda.init() context = make_default_context() context.pop() clear_context_caches() del context HAVE_CUDA = True CUDA_ERROR = "" except Exception as exc: HAVE_CUDA = False CUDA_ERROR = str(exc) from . import generate from .kernel import KernelModel, Kernel # pylint: disable=unused-import try: from typing import Tuple, Callable, Any from .modelinfo import ModelInfo from .details import CallDetails
def tearDownClass(cls):
    """Call ``integrate.shutdown()``, pop ``cls.ctx`` and clear the caches."""
    integrate.shutdown()

    context = cls.ctx
    context.pop()
    clear_context_caches()
def tearDownClass(cls):
    """Call ``magma.magma_finalize()``, pop ``cls.ctx`` and clear the caches."""
    magma.magma_finalize()

    context = cls.ctx
    context.pop()
    clear_context_caches()
def __finalize(self):
    """Release the context held by this object and clear the context caches.

    Pops ``self.context`` and resets the attribute to ``None``.  Because
    the attribute is ``None`` after the first call, a repeated call now skips
    the pop instead of raising ``AttributeError``; the caches are cleared on
    every call.
    """
    if self.context is not None:
        self.context.pop()
        self.context = None
    clear_context_caches()
def tearDownClass(cls):
    """Pop the class-level context ``cls.ctx`` and clear the context caches."""
    context = cls.ctx
    context.pop()
    clear_context_caches()
def tearDownClass(cls):
    """Call ``integrate.shutdown()``, pop ``cls.ctx`` and clear the caches."""
    integrate.shutdown()

    context = cls.ctx
    context.pop()
    clear_context_caches()
def stop(self):
    """Stop the parent class, then release this object's context.

    Always delegates to the superclass ``stop()`` first.  ``self.context``
    is then popped and reset to ``None``; because the attribute is ``None``
    after the first call, a repeated call now skips the pop instead of
    raising ``AttributeError``.  The context caches are cleared on every call.
    """
    super(cudaCGH, self).stop()

    if self.context is not None:
        self.context.pop()
        self.context = None
    tools.clear_context_caches()
def recompile_all(function_name, kernel_src, device_ids=None):
    """Discard every cached cubin and compile the kernel again.

    Rebinds the global ``KERNEL_cubins`` to a new empty dict, clears the
    context caches and then forwards all three arguments to
    ``compile_all``.  Nothing is returned.
    """
    global KERNEL_cubins
    KERNEL_cubins = dict()

    tools.clear_context_caches()
    compile_all(function_name, kernel_src, device_ids)
# factorize: GPU matrix-factorisation training loop.
#
# What the visible code does:
#   * builds U and V with initUV(max(users) + 1, latent, max(movies) + 1),
#     casts both to float32 and transposes V;
#   * evaluates rmse() on the test triples once and stores it as `error`;
#   * for at most `steps` iterations: casts users/movies/ratings to int32,
#     clears the context caches, uploads those three arrays plus U and V with
#     gpuarray.to_gpu and, when both index arrays are non-empty, launches
#     `matrixfact` with block=(16, 16, 1) and grid=(3, 4), then reads
#     a_gpu / b_gpu back into U and V;
#   * after each step calls rmse() on the test and on the training triples,
#     appends the elapsed time to y1 and [train_rmse, test_rmse] to y2;
#   * compares the test error rounded to 4 places with `delta` and `error`:
#     below `delta` -> break; equal to `error` -> count += 1; above `error`
#     -> break; otherwise break if count == rmse_repeat_count, else store the
#     new value in `error`;
#   * debug > 1 adds progress/timing prints and writes y1 / y2 with
#     np.savetxt to 'gpmf-<start_time>-y1.txt' / '-y2.txt'; debug > 2 also
#     saves U and V per step.
# No return statement appears in the visible code.
#
# NOTE(review): time.clock() was removed in Python 3.8; these calls need a
#   replacement (e.g. time.perf_counter) before this runs on a current
#   interpreter.
# NOTE(review): the statement layout below has lost its line breaks and
#   indentation in this copy. The nesting of the debug/timing statements and
#   of the final np.savetxt calls must be confirmed against the original
#   file before this code is re-indented.
def factorize(users, movies, ratings, test_users, test_movies, test_ratings, latent=30, steps=10, gpu_steps=1, alpha=0.0002, beta=0.02, delta=0.01, rmse_repeat_count=5, debug=1): U, V = initUV(int(np.max(users) + 1), latent, int(np.max(movies) + 1)) U, V = np.array(U).astype(np.float32), np.array(V).astype( np.float32).transpose() print("Shape of P,Q : ", U.shape, V.shape) start_time = time.clock() y1, y2 = [], [] error, count = rmse(test_users, test_movies, test_ratings, U, V.T), 0 print("Initial test error :", round(error, 4)) for k in range(steps): if debug > 1: print("Step : ", k) t6 = time.clock() uu, mm, rr = np.array(users).astype(np.int32), np.array(movies).astype( np.int32), np.array(ratings).astype(np.int32) t7 = time.clock() tools.clear_context_caches() u_gpu = gpuarray.to_gpu(uu) v_gpu = gpuarray.to_gpu(mm) r_gpu = gpuarray.to_gpu(rr) a_gpu = gpuarray.to_gpu(U) b_gpu = gpuarray.to_gpu(V) if debug > 1: print("Length of uu,mm ", len(uu), len(mm), np.max(users), np.max(movies), U.shape, V.shape) if (len(uu) != 0 and len(mm) != 0): matrixfact( u_gpu, v_gpu, r_gpu, a_gpu, b_gpu, np.int32(np.max(users)), np.int32(latent), np.int32(np.max(movies)), np.int32(len(uu)), np.int32(len(mm)), np.int32(gpu_steps), np.float32(alpha), np.float32(beta), np.float32(delta), block=(16, 16, 1), grid=( 3, 4 ) # always keep blockIdx.z as 1 - the kernal expects no threads in z axis ) P = a_gpu.get() Q = b_gpu.get() U, V = np.array(P), np.array(Q) t8 = time.clock() if debug > 1: t9 = time.clock() if debug > 2: np.savetxt('U' + str(k), U, fmt='%.4f') np.savetxt('V' + str(k), V, fmt='%.4f') print("Timer :", round(t7 - t6, 4), round(t8 - t7, 4), round(t9 - t8, 4)) t5 = time.clock() if debug > 1: print("Step time taken : ", round(t5 - t7, 2)) y1.append(t5 - start_time) test_rmse = rmse(test_users, test_movies, test_ratings, U, V.T) print("Step test error :", round(test_rmse, 4)) train_rmse = rmse(users, movies, ratings, U, V.T) y2.append([train_rmse, test_rmse]) step_error = 
round(test_rmse, 4) if step_error < delta: break elif step_error == error: count = count + 1 elif step_error > error: break elif rmse_repeat_count == count: break else: error = step_error if debug > 1: np.savetxt('gpmf-' + str(start_time) + '-y1.txt', y1, fmt='%.4f') np.savetxt('gpmf-' + str(start_time) + '-y2.txt', y2, fmt='%.4f')
def tearDownClass(cls):
    """Call ``misc.shutdown()``, pop ``cls.ctx`` and clear the context caches."""
    misc.shutdown()

    context = cls.ctx
    context.pop()
    clear_context_caches()