def init_kernels():
    global sum_cols_kernel, iadd_kernel, multiply_kernel, m_dot_kernel, \
        mv_batched_kernel, initialized

    if initialized:
        warnings.warn("Kernels already initialized, skipping init")
        return

    from pycuda import autoinit, driver, compiler
    from skcuda import misc

    dev = autoinit.device
    print("GPU found, using %s %s" % (dev.name(), dev.compute_capability()))

    misc.init()

    DTYPES = ["double", "float"]

    def parse_kernels():
        # Expand the %float_type% / %transpose_*% placeholders in kernels.cu
        # into one concrete kernel per combination.
        with open(os.path.join(os.path.dirname(__file__), "kernels.cu")) as f:
            code = f.read()

        code = code.replace("%tile_len%", "32")

        funcs = code.split("__global__ void")
        new_funcs = []
        for f in funcs:
            if "%float_type%" in f:
                for t in DTYPES:
                    new_funcs += [f.replace("%float_type%", t)]
            else:
                new_funcs += [f]
        funcs = new_funcs

        new_funcs = []
        for f in funcs:
            if "%transpose_a%" in f:
                for t_a in ["0", "1"]:
                    new_funcs += [f.replace("%transpose_a%", t_a)]
            else:
                new_funcs += [f]
        funcs = new_funcs

        new_funcs = []
        for f in funcs:
            if "%transpose_b%" in f:
                for t_b in ["0", "1"]:
                    new_funcs += [f.replace("%transpose_b%", t_b)]
            else:
                new_funcs += [f]

        code = "__global__ void".join(new_funcs)

        return code

    try:
        kernels = compiler.SourceModule(parse_kernels())
    except driver.CompileError:
        # On a compile failure, dump the numbered source for inspection.
        with open("kernel_code.txt", "w") as f:
            for i, line in enumerate(parse_kernels().split("\n")):
                f.write("%03d %s\n" % (i, line))
        raise

    sum_cols_kernel = [
        kernels.get_function("sum_cols_%s" % dtype).prepare("PPiii")
        for dtype in DTYPES
    ]
    iadd_kernel = [
        kernels.get_function("iadd_%s" % dtype).prepare("PPii")
        for dtype in DTYPES
    ]
    multiply_kernel = [
        kernels.get_function("multiply_%s" % dtype).prepare("PPPii")
        for dtype in DTYPES
    ]
    m_dot_kernel = [[[
        kernels.get_function(
            "shared_m_dot_%s_%s_%s" % (dtype, a, b)).prepare("PPPiiii")
        for b in ["0", "1"]
    ] for a in ["0", "1"]] for dtype in DTYPES]
    mv_batched_kernel = [[
        kernels.get_function("mv_batched_%s_%s" % (dtype, a)).prepare("PPPiii")
        for a in ["0", "1"]
    ] for dtype in DTYPES]

    initialized = True
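# A minimal sketch of how one of the prepared kernels above could be launched
# with pycuda's prepared_call. The argument order (dst, src, rows, cols) and
# the grid/block shapes are assumptions for illustration only; the real
# layout is defined by this project's kernels.cu.
import numpy as np
from pycuda import gpuarray

init_kernels()

dst = gpuarray.zeros((64, 32), dtype=np.float32)
src = gpuarray.to_gpu(np.random.randn(64, 32).astype(np.float32))

iadd_float = iadd_kernel[1]  # DTYPES == ["double", "float"], so index 1 is float
iadd_float.prepared_call(
    (2, 1),                  # grid (assumed)
    (32, 32, 1),             # block (assumed)
    dst.gpudata, src.gpudata,
    np.int32(64), np.int32(32))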
## check gpu functionality! ##
global gpu_func
gpu_func = True
try:
    __import__("pycuda")
except ImportError:
    gpu_func = False
else:
    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import skcuda.fft as cu_fft
    from skcuda import misc
    misc.init()

    def to_gpu_c(somedata):
        # all my complex data
        return gpuarray.to_gpu(somedata.astype('complex64'))

    def to_gpu_f(somedata):
        # all my float data
        return gpuarray.to_gpu(somedata.astype('float32'))
## check gpu functionality! ##


class MFTIE():
    """
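# A minimal usage sketch of the guard above (separate from the truncated
# class): move data to the GPU only when pycuda imported successfully,
# otherwise fall back to plain NumPy.
import numpy as np

data = np.random.rand(256, 256)

if gpu_func:
    data_gpu = to_gpu_f(data)          # float32 copy on the device
    result = (data_gpu * 2).get()      # gpuarray arithmetic, then copy back
else:
    result = data.astype('float32') * 2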
def setUp(self):
    np.random.seed(0)
    misc.init()
assert excProfile.shape == refProfile.shape and excProfile.ndim == 1, \
    "Slice profiles must be one-dimensional vectors and contain the same number of samples"

###########################################################
## Initialization
###########################################################

if useGPU:
    import pycuda.driver as cuda
    import pycuda.autoinit
    import skcuda.linalg as sklinalg
    import skcuda.misc as skmisc
    from FatFractionLookup_GPU import FatFractionLookup_GPU as FatFractionLookup
    import findmax_ff
    skmisc.init()
    NTHREADS = 1
else:
    from FatFractionLookup import FatFractionLookup

[dicomStack, infos] = load3dDicom(baseDir)

etl = int(infos[0].EchoTrainLength)
echoSpacing = float(infos[0].EchoTime)

oldShape = dicomStack.shape
newShape = (oldShape[0], oldShape[1], etl, int(oldShape[2]/etl))
print(newShape)
nSlices = newShape[3]
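# Illustrative shape arithmetic for the reshape above (the numbers are made
# up): with a 256x256x120 DICOM stack and an echo train length of 8, the
# volume is regrouped into 120/8 = 15 slices of 8 echoes each.
exampleOldShape = (256, 256, 120)
exampleEtl = 8
exampleNewShape = (exampleOldShape[0], exampleOldShape[1], exampleEtl,
                   int(exampleOldShape[2] / exampleEtl))
assert exampleNewShape == (256, 256, 8, 15)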
from sklearn.exceptions import ConvergenceWarning
from ..utils.extmath import safe_sparse_dot
from sklearn.utils.validation import check_is_fitted
from sklearn.utils.multiclass import _check_partial_fit_first_call, unique_labels
from sklearn.utils.multiclass import type_of_target
from ..base import RegressorMixin

import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import pycuda.gpuarray as gpuarray
import skcuda.misc as cumisc
import skcuda.linalg as culinalg

cumisc.init()
culinalg.init()

_STOCHASTIC_SOLVERS = ['sgd', 'adam']


def _pack(coefs_, intercepts_):
    """Pack the parameters into a single vector."""
    return np.hstack([l.ravel() for l in coefs_ + intercepts_])


class BaseMultilayerPerceptron(six.with_metaclass(ABCMeta, BaseEstimator)):
    """Base class for MLP classification and regression.

    Warning: This class should not be used directly.
    Use derived classes instead.
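# Illustrative sketch (separate from the class above) of why cumisc.init()
# and culinalg.init() are called at import time: skcuda.linalg routines such
# as dot() must be initialized before they can run GPU matrix products.
import numpy as np
import pycuda.gpuarray as gpuarray
import skcuda.linalg as culinalg

a = gpuarray.to_gpu(np.random.rand(128, 64).astype(np.float32))
b = gpuarray.to_gpu(np.random.rand(64, 32).astype(np.float32))
c = culinalg.dot(a, b)    # (128, 32) product computed on the GPU
print(c.get().shape)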
import copy
from functools import wraps
import warnings

import numpy as np
from pycuda import gpuarray
from skcuda import cublas, misc

import hessianfree as hf

misc.init()


def debug_wrapper(cpu_func, debug=False):
    """Decorator used to specify an equivalent CPU function that can be
    used to verify the output of a GPU function (for debugging)."""

    def debug_func_parametrized(gpu_func):
        @wraps(gpu_func)
        def debug_func(*args, **kwargs):
            if debug:
                cpu_args = list(args)
                for i, a in enumerate(cpu_args):
                    if isinstance(a, gpuarray.GPUArray):
                        cpu_args[i] = a.get()
                cpu_kwargs = copy.copy(kwargs)
                for k in cpu_kwargs:
                    if isinstance(cpu_kwargs[k], gpuarray.GPUArray):
                        cpu_kwargs[k] = cpu_kwargs[k].get()
                cpu_kwargs.pop("stream", None)
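# Hypothetical use of the decorator above (whose body is truncated here):
# wrap a GPU routine with an equivalent NumPy function so the two results can
# be compared when debug=True. gpu_multiply is an illustrative function, not
# part of hessianfree.
@debug_wrapper(np.multiply, debug=True)
def gpu_multiply(a, b):
    # plain GPUArray elementwise product as the "GPU function"
    return a * b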
def setUpClass(cls):
    cls.ctx = make_default_context()
    misc.init()
def decompose(d, beta, betaT, _gamma, verbose):
    print('start decomposing')
    sk_misc.init()

    D = gpuarray.to_gpu(d)
    Beta = gpuarray.to_gpu(beta)
    BetaT = gpuarray.to_gpu(betaT)

    l, m, n = D.shape
    step_size = np.maximum(m, n)
    lmn, k = Beta.shape

    kernel_code = kernel_code_template % {
        'MATRIX_A_COL_SIZE': lmn,
        'BLOCK_ROW_SIZE': 1,
        'BLOCK_COL_SIZE': 1,
        'STEP_SIZE': step_size,
    }
    mod = compiler.SourceModule(kernel_code)
    matrixmul = mod.get_function("MatrixMulKernel")
    grid = (1, k)
    block = (step_size, 1, 1)

    D_v = D.reshape(l * m * n, 1)
    _, k = Beta.shape

    tol = 0.2
    max_iter = 10000
    tau = 0.1
    sigma = 1 / (12 * tau)

    x_t = gpuarray.zeros((l, m, n), D.dtype)
    x_a = gpuarray.zeros((k, 1), D.dtype)
    y_t = gpuarray.zeros((3, l, m, n), D.dtype)

    EL, ET, E = computeEnergy(D_v, x_t, _gamma, x_a, Beta)
    print('Initial Energy: E = ' + str(E) + ', EL=' + str(EL) + ', ET= ' + str(ET))

    Es = E
    change = 10
    t0 = time.perf_counter()  # time.clock() was removed in Python 3.8
    for i in range(max_iter):
        ks_yt = -div(y_t)
        xt_new, xa_new = ProxG(x_t - tau * ks_yt, x_a, D_v, tau, Beta, BetaT,
                               matrixmul, grid, block)
        yt_new = ProxFSs(y_t + sigma * grad(2 * xt_new - x_t), _gamma)

        x_t = xt_new
        x_a = xa_new
        y_t = yt_new

        EL, ET, E = computeEnergy(D_v, x_t, _gamma, x_a, Beta)
        Es = np.append(Es, E)
        length = Es.shape[0]
        El5 = np.mean(Es[np.maximum(0, length - 6):length - 1])
        El5c = np.mean(Es[np.maximum(0, length - 5):length])
        change = np.append(change, El5c - El5)
        t1 = time.perf_counter() - t0

        if np.mod(i + 1, 100) == 0:
            print('Iter ' + str(i + 1) + ': E = ' + str(E) + '; EL=' + str(EL) +
                  ', ET=' + str(ET) + ', avechg = ' + str(change[length - 1]))
        if i >= 100 and np.max(np.abs(
                change[np.maximum(0, length - 3):length])) < tol:
            print('Converged after ' + str(i + 1) + ' iterations.')
            break

    T = x_t
    Alpha = x_a
    L = D - T

    l = L.get()
    t = T.get()
    alpha = Alpha.get()
    sk_misc.shutdown()
    return (l, t, alpha)
def decompose(d, beta, betaT, _lambda, _gamma, _lambda_c, _gamma_c, verbose):
    print('start decomposing in GPU')
    sk_misc.init()

    D = gpuarray.to_gpu(d)
    Beta = gpuarray.to_gpu(beta)
    BetaT = gpuarray.to_gpu(betaT)

    l, m, n = D.shape
    step_size = np.maximum(m, n)
    lmn, k = Beta.shape

    _Lambda = gpuarray.to_gpu(_lambda)
    _Gamma = gpuarray.to_gpu(_gamma)

    kernel_code = kernel_code_template % {
        'MATRIX_A_COL_SIZE': lmn,
        'BLOCK_ROW_SIZE': 1,
        'BLOCK_COL_SIZE': 1,
        'STEP_SIZE': step_size,
    }
    mod = compiler.SourceModule(kernel_code)
    matrixmul = mod.get_function("MatrixMulKernel")
    grid = (1, k)
    block = (step_size, 1, 1)

    D_v = D.reshape(lmn, 1)

    tol = 0.1
    max_iter = 10000
    tau = 0.1
    sigma = 1 / (13 * tau)

    x_s = gpuarray.zeros((l, m, n), D.dtype)
    x_t = gpuarray.zeros((l, m, n), D.dtype)
    x_a = gpuarray.zeros((k, 1), D.dtype)
    y_t = gpuarray.zeros((3, l, m, n), D.dtype)
    y_s = x_s

    xs_new = gpuarray.zeros_like(x_s)
    xt_new = gpuarray.zeros_like(x_t)
    xa_new = gpuarray.zeros_like(x_a)
    ys_new = gpuarray.zeros_like(y_s)
    yt_new = gpuarray.zeros_like(y_t)

    assign_matrix(x_s, _Lambda, D)

    EL, ES, ET, Es = computeEnergy(D_v, x_s, x_t, _Lambda, _gamma_c, x_a, Beta)
    print('Initial Energy: E = ' + str(Es) + ', EL=' + str(EL) + ', ES=' +
          str(ES) + ', ET=' + str(ET))

    change = 10
    t0 = time.time()

    print_iters = 200
    if verbose == True:
        print_iters = 50

    for i in range(max_iter):
        ks_yt = -div(y_t)
        ks_ys = y_s
        xs_new, xt_new, xa_new = ProxG(x_s - tau * ks_ys, x_t - tau * ks_yt,
                                       x_a, D_v, tau, Beta, BetaT, matrixmul,
                                       grid, block)
        assign_matrix(xs_new, _Lambda, D)
        assign_matrix(xt_new, _Gamma, D)
        ys_new, yt_new = ProxFSs(y_s + sigma * (2 * xs_new - x_s),
                                 y_t + sigma * grad(2 * xt_new - x_t),
                                 _Lambda, _gamma_c)

        x_s = xs_new
        x_t = xt_new
        x_a = xa_new
        y_s = ys_new
        y_t = yt_new

        EL, ES, ET, E = computeEnergy(D_v, x_s, x_t, _Lambda, _gamma_c, x_a, Beta)
        Es = np.append(Es, E)
        length = Es.shape[0]
        El5 = np.mean(Es[np.maximum(0, length - 6):length - 1])
        El5c = np.mean(Es[np.maximum(0, length - 5):length])
        change = np.append(change, El5c - El5)
        t1 = time.time() - t0

        if np.mod(i + 1, print_iters) == 0:
            print('Iter ' + str(i + 1) + ': E = ' + str(E) + '; EL=' + str(EL) +
                  ', ES=' + str(ES) + ', ET=' + str(ET) +
                  ', avechg = ' + str(change[length - 1]))
        if i >= 100 and np.max(np.abs(
                change[np.maximum(0, length - 3):length])) < tol:
            print('Iter ' + str(i + 1) + ': E = ' + str(E) + '; EL=' + str(EL) +
                  ', ES=' + str(ES) + ', ET=' + str(ET) +
                  ', avechg = ' + str(change[length - 1]))
            print('Converged after ' + str(i + 1) + ' iterations.')
            break

    S = x_s
    T = x_t
    Alpha = x_a
    L = D - S - T

    l = L.get()
    s = S.get()
    t = T.get()
    alpha = Alpha.get()
    sk_misc.shutdown()
    return (l, s, t, alpha)
def setUpClass(cls):
    misc.init()