def __init__(self):
    """Initialise the GPU linear-algebra backend and cache kernel handles.

    Calls ``culinalg.init()``, stores a handle returned by
    ``cublas.cublasCreate()`` in ``self.handle``, and looks up every
    kernel listed below from ``culinalg_kernel``.  Each kernel function is
    stored on the instance under its own name plus a ``_kernel`` suffix
    (for example ``'_elem'`` becomes ``self._elem_kernel`` and ``'X_max'``
    becomes ``self.X_max_kernel``).  Finally three block-shape tuples are
    recorded.
    """
    culinalg.init()
    self.handle = cublas.cublasCreate()

    # Kernel names, in the order they are fetched from the module.
    kernel_names = (
        # element-wise kernels
        '_elem', '_sigmoid', '_log_anti_sigmoid', '_tanh', '_pow',
        '_sqrt', '_square', '_exp', '_log', '_sum', '_compare', '_reverse',
        # reductions
        'X_max', 'X_min', 'X_sum', 'X_norm',
        # scalar / vector / matrix arithmetic
        's_mul_x', 's_add_x',
        'x_add_y', 'X_add_Y',
        'x_mul_y', 'X_mul_Y',
        'x_div_y', 'X_div_Y',
        # vector-with-matrix variants
        'x_radd_Y_as_Y', 'x_cadd_Y_as_Y',
        'x_rmul_Y_as_Y', 'x_cmul_Y_as_Y',
        'x_radd_Y_as_x', 'x_cadd_Y_as_x',
        # outer / dot products and indexing
        'x_outer_y_add_O', 'X_router_Y_add_O', 'X_rdot_Y',
        'index_to_array',
    )
    for kernel_name in kernel_names:
        setattr(self, kernel_name + '_kernel',
                culinalg_kernel.get_function(kernel_name))

    # Presumably the thread-block shapes used when launching the kernels
    # above -- TODO confirm against the call sites.
    self._2d_block = (32, 32, 1)
    self._1d_block = (1024, 1, 1)
    self._3d_block = (16, 16, 4)
from .parallel import get_id_within_node gpuid = get_id_within_node() import pycuda.driver pycuda.driver.init() if gpuid >= pycuda.driver.Device.count(): print '[' + MPI.Get_processor_name( ) + '] more processes than the GPU numbers!' #MPI.COMM_WORLD.Abort() raise gpu_device = pycuda.driver.Device(gpuid) gpu_context = gpu_device.make_context() gpu_initialized = True else: import pycuda.autoinit gpu_initialized = True except: pass try: from scikits.cuda import cublas import scikits.cuda.linalg as culinalg culinalg.init() cublas_handle = cublas.cublasCreate() except: pass def closeGPU(): if gpu_context is not None: gpu_context.detach()
def setUp(self):
    """Seed NumPy's global random generator with 0 and call ``linalg.init()``.

    Presumably a test-fixture hook run before each test case (the
    enclosing class is not visible here); the fixed seed makes any random
    inputs generated afterwards repeatable.
    """
    np.random.seed(seed=0)
    linalg.init()
#!/usr/bin/env python from __future__ import division import h5py import sys import numpy as np import scipy.spatial.distance as ssd import pycuda.driver as drv from pycuda import gpuarray from scikits.cuda import linalg from lfd.tpsopt import tps linalg.init() from lfd.tpsopt.tps import tps_kernel_matrix, tps_eval from lfd.tpsopt.culinalg_exts import dot_batch_nocheck, get_gpu_ptrs from lfd.tpsopt.precompute import downsample_cloud, batch_get_sol_params from cuda_funcs import ( init_prob_nm, norm_prob_nm, get_targ_pts, check_cuda_err, fill_mat, reset_cuda, sq_diffs, closest_point_cost, scale_points, gram_mat_dist, )
import pycuda.driver as cuda import pycuda.compiler as compiler import pycuda.tools as tools from pycuda.compiler import SourceModule import numpy as np import scikits.cuda.linalg as culinalg import scikits.cuda.misc as cumisc import string from ctypes import * cdll.LoadLibrary("/usr/local/lib/libCudaKernelLibrary.so") kmeansLib = CDLL("/usr/local/lib/libCudaKernelLibrary.so") culinalg.init() # Double precision is only supported by devices with compute # capability >= 1.3: demo_types = [np.float32] if cumisc.get_compute_capability(pycuda.autoinit.device) >= 5.0: demo_types.extend([np.float64]) for t in demo_types: np.random.seed(seed=42) m = 899946 n = 129 k = 1024 print 'Testing matrix multiplication for type ' + str(np.dtype(t)) a = np.asarray(np.random.rand(m,n), t)
def setUp(self):
    """Call ``linalg.init()`` so the linear-algebra backend is initialised.

    Presumably a test-fixture hook run before each test case -- the
    enclosing class is not visible here.
    """
    linalg.init()
from abc import ABCMeta, abstractmethod import numpy as np import time as t import pycuda.driver as cuda from pycuda import gpuarray import pycuda.autoinit from pycuda.compiler import SourceModule from pycuda.elementwise import ElementwiseKernel import pycuda.curandom as curandom import pycuda.cumath as cumath import scikits.cuda.linalg as linalg linalg.init() class Layer: __metaclass__ = ABCMeta @abstractmethod def updateOutputs(self,inputs): pass @abstractmethod def updateGradient(self,previous_grad,include_prior): pass @abstractmethod def setWeights(self,new_weights): pass @abstractmethod
import time import pycuda.gpuarray as gpuarray import pycuda.autoinit import numpy as np import scikits.cuda.linalg as cla import numpy.linalg as la import scikits.cuda.cula as cula cla.init() def testForSize(x): print 'Image Size %dx%d' % (x,x) x = np.random.rand(x**2, 40).astype(np.float32) def svdoverwrite(a_gpu, u_gpu, s_gpu, v_gpu, m, n, lda, ldu, ldvt): data_type = a_gpu.dtype.type real_type = np.float32 cula_func = cula._libcula.culaDeviceSgesvd jobu = 'S' jobvt = 'S' status = cula_func(jobu, jobvt, m, n, int(a_gpu.gpudata), lda, int(s_gpu.gpudata), int(u_gpu.gpudata), ldu, int(v_gpu.gpudata), ldvt) cula.culaCheckStatus(status) # Free internal CULA memory: cula.culaFreeBuffers()
def _gp_alloc_device(host_array, expected_type):
    """Check *host_array* with ``check_type`` and allocate a device buffer of equal byte size."""
    check_type(host_array, expected_type)
    return drv.mem_alloc(host_array.nbytes)


def _gp_upload_device(host_array, expected_type):
    """Allocate a device buffer for *host_array* and copy the host contents into it."""
    device_buffer = _gp_alloc_device(host_array, expected_type)
    drv.memcpy_htod(device_buffer, host_array)
    return device_buffer


def gaussian_process(data, feedback, feedback_indices, float_type=np.float32,
                     int_type=np.int32, kernel_file=None, debug=False,
                     K_noise=None, K_xx_noise=None):
    """Compute a GPU Gaussian-process mean and variance for the unlabelled rows.

    The rows of ``data`` named by ``feedback_indices`` are the observed
    points; every other row is a prediction point.  The function builds
    ``K`` (observed vs. observed) and ``K_x`` (prediction vs. observed)
    with the CUDA kernels from ``kernel_file``, inverts ``K`` through
    ``invert_K``, and then evaluates

    * ``K_xK = K_x . K_inv``
    * ``mean = K_xK . feedback``
    * ``variance`` from ``diag_K_xx`` and ``diag(K_xK . K_x^T)`` via
      ``calc_variance``.

    All vectors and matrices are padded with ``pad_vector`` /
    ``round_up_to_blocksize`` so their sizes match the kernel block size;
    the padding is stripped again before returning.

    Args:
        data: 2-D array-like, one row per item and one column per feature.
        feedback: Feedback value for each entry of ``feedback_indices``.
        feedback_indices: Row indices of ``data`` that have feedback.  The
            prediction count is computed as
            ``len(data) - len(feedback_indices)``, so the indices are
            assumed to be unique and in range.
        float_type: NumPy floating dtype used for host and device arrays.
        int_type: NumPy integer dtype used for index arrays and sizes.
        kernel_file: Path of the CUDA source file.  Defaults to
            ``kernels.c`` next to this module.
        debug: When true, recompute each intermediate result on the host,
            compare it through ``check_result`` and print diagnostics.
        K_noise: Optional noise added to the diagonal of ``K`` (length
            ``n_feedback``).  Drawn from ``normal(1, 0.1)`` when omitted.
        K_xx_noise: Optional values used as ``diag_K_xx`` (length
            ``n_predict``).  Drawn from ``normal(1, 0.1)`` when omitted.

    Returns:
        Tuple ``(mean, variance)`` of flat arrays with one entry per
        prediction row, ordered like
        ``np.setdiff1d(arange(n_total), feedback_indices)``.

    Raises:
        Exception: Whatever ``SourceModule`` raises when the kernel source
            fails to compile; the error is printed and re-raised.
    """
    if kernel_file is None:
        kernel_file = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'kernels.c')
    if debug:
        print("Initialized starts")
        print("Loading test data")

    # NOTE: this changes NumPy's process-wide print options.
    np.set_printoptions(linewidth=500)
    # Imported for its side effect: pycuda.autoinit sets up a CUDA context.
    import pycuda.autoinit  # noqa: F401

    block_size = (8, 8, 16)
    n_features = np.int32(np.size(data, 1))
    # TODO: Assuming the n_features is divisible by block_size[2]

    with open(kernel_file, 'r') as kernel_fh:
        kernel_source = kernel_fh.read()
    try:
        cuda_module = SourceModule(kernel_source)
    except Exception as e:
        # Without a compiled module every later kernel call would fail in a
        # far less obvious way, so report the compiler error and stop here.
        print(e)
        raise

    # ---- Host-side preparation: pad everything to match the block size ----
    n_total = int_type(np.size(data, 0))
    # np.asfarray was removed in NumPy 2.0; for a floating float_type this
    # is the equivalent conversion.
    data = np.asarray(data, dtype=float_type)

    n_feedback = np.size(feedback_indices, 0)
    n_feedback_padded = round_up_to_blocksize(n_feedback, block_size, int_type)
    feedback_indices = np.asarray(feedback_indices, dtype=int_type)
    predict_indices = np.setdiff1d(np.arange(int(n_total)), feedback_indices)
    # Matches len(predict_indices) only when feedback_indices are unique
    # and all lie inside range(n_total).
    n_predict = int_type(len(data) - len(feedback_indices))
    n_predict_padded = round_up_to_blocksize(n_predict, block_size, int_type)

    feedback_indices = pad_vector(feedback_indices, n_feedback,
                                  n_feedback_padded, dtype=int_type)
    predict_indices = pad_vector(predict_indices, n_predict,
                                 n_predict_padded, dtype=int_type)

    K = np.zeros((n_feedback_padded, n_feedback_padded), dtype=float_type)
    K_x = np.zeros((n_predict_padded, n_feedback_padded), dtype=float_type)

    # Diagonal noise for K (random unless supplied by the caller).
    if K_noise is None:
        K_noise = np.random.normal(1, 0.1, n_feedback)
    K_noise = pad_vector(K_noise, n_feedback, n_feedback_padded,
                         dtype=float_type)

    # Diagonal of K_xx (random unless supplied by the caller).
    if K_xx_noise is None:
        diag_K_xx = np.random.normal(1, 0.1, n_predict)
    else:
        diag_K_xx = K_xx_noise
    diag_K_xx = pad_vector(diag_K_xx, n_predict, n_predict_padded,
                           dtype=float_type)

    diag_K_xKK_x_T = np.zeros((1, n_predict_padded), dtype=float_type)
    variance = np.zeros((1, n_predict_padded), dtype=float_type)
    feedback = np.array(feedback)
    feedback = pad_vector(feedback, n_feedback, n_feedback_padded,
                          dtype=float_type)

    # ---- Device allocation; every array is type-checked before upload ----
    # TODO: add dimension checking
    data_gpu = _gp_upload_device(data, float_type)
    feedback_indices_gpu = _gp_upload_device(feedback_indices, int_type)
    K_gpu = _gp_upload_device(K, float_type)
    # Same size as K; handed to invert_K without being initialised from host.
    K_inv_gpu = _gp_alloc_device(K, float_type)
    K_noise_gpu = _gp_upload_device(K_noise, float_type)
    K_x_gpu = _gp_upload_device(K_x, float_type)
    predict_indices_gpu = _gp_upload_device(predict_indices, int_type)
    diag_K_xx_gpu = _gp_upload_device(diag_K_xx, float_type)
    diag_K_xKK_x_T_gpu = _gp_upload_device(diag_K_xKK_x_T, float_type)
    variance_gpu = _gp_alloc_device(variance, float_type)
    # feedback is uploaded later as a GPUArray for linalg.dot, so only the
    # type check is needed here.
    check_type(feedback, float_type)

    # ---- K: observed vs. observed ----
    calc_K(cuda_module, block_size, n_features, n_feedback_padded, data_gpu,
           feedback_indices_gpu, K_noise_gpu, K_gpu)
    drv.memcpy_dtoh(K, K_gpu)
    if debug:
        K_test_features = np.asarray([data[i] for i in feedback_indices],
                                     dtype=float_type)
        K_test = (dist.cdist(K_test_features, K_test_features, 'cityblock')
                  / n_features + np.diag(K_noise))
        check_result('K', K[:n_feedback, :n_feedback],
                     K_test[:n_feedback, :n_feedback])

    K_inv = invert_K(n_feedback, n_feedback_padded, float_type, K_gpu,
                     K_inv_gpu)

    # ---- K_x: prediction vs. observed ----
    calc_K_x(cuda_module, block_size, n_feedback_padded, n_predict_padded,
             n_features, feedback_indices_gpu, predict_indices_gpu, data_gpu,
             K_x_gpu)
    drv.memcpy_dtoh(K_x, K_x_gpu)
    if debug:
        K_x_test = np.zeros((n_predict_padded, n_feedback_padded),
                            dtype=float_type)
        for i, idx1 in enumerate(predict_indices):
            for j, idx2 in enumerate(feedback_indices):
                K_x_test[i][j] = (distance(data[idx1], data[idx2])
                                  / len(data[0]))
        check_result('K_x', K_x[:n_predict, :n_feedback],
                     K_x_test[:n_predict, :n_feedback])

    # ---- K_xK = K_x . K_inv ----
    linalg.init()
    K_inv_gpuarr = gpuarray.to_gpu(K_inv.astype(float_type))
    K_x_gpuarr = gpuarray.to_gpu(K_x.astype(float_type))
    K_xK_gpuarr = linalg.dot(K_x_gpuarr, K_inv_gpuarr)
    K_xK = K_xK_gpuarr.get()
    if debug:
        K_xK_test = (np.matrix(K_x) * np.matrix(K_inv))
        check_result('K_xK', K_xK[:n_predict, :n_feedback],
                     K_xK_test[:n_predict, :n_feedback])
        print(K_xK.shape)

    # ---- diag(K_xK . K_x^T) ----
    calc_K_xKK_x_T(cuda_module, block_size, n_feedback_padded,
                   n_predict_padded, K_xK_gpuarr, K_x_gpu, diag_K_xKK_x_T_gpu)
    drv.memcpy_dtoh(diag_K_xKK_x_T, diag_K_xKK_x_T_gpu)
    if debug:
        K_xKK_x_T_test = np.diag(np.matrix(K_xK) * np.matrix(K_x).T)
        check_result("K_xKK_x_T", diag_K_xKK_x_T, K_xKK_x_T_test)

    # ---- Variance ----
    calc_variance(cuda_module, block_size, n_predict_padded, diag_K_xx_gpu,
                  diag_K_xKK_x_T_gpu, variance_gpu)
    drv.memcpy_dtoh(variance, variance_gpu)
    if debug:
        # NOTE(review): this reference omits the sqrt that the full-result
        # reference further down applies -- confirm which one matches the
        # calc_variance kernel.
        variance_test = np.abs(np.subtract(diag_K_xx[:n_predict],
                                           diag_K_xKK_x_T[:, :n_predict]))
        check_result('Variance', variance[:, :n_predict],
                     variance_test[:, :n_predict])

    # ---- Mean = K_xK . feedback (feedback as a column vector) ----
    feedback = np.atleast_2d(feedback).T
    feedback_gpuarr = gpuarray.to_gpu(feedback.astype(float_type))
    mean_gpuarr = linalg.dot(K_xK_gpuarr, feedback_gpuarr)
    mean = mean_gpuarr.get()
    if debug:
        mean_test = np.dot(K_xK, feedback)
        check_result('Mean', mean[:n_predict], mean_test[:n_predict])

    if debug:
        # Recompute the full result on the host from the unpadded inputs.
        ref_feedback = feedback[:n_feedback]
        ref_feedback_indices = feedback_indices[:n_feedback]
        ref_predict_indices = predict_indices[:n_predict]
        test_K_noise = K_noise[:n_feedback]
        test_K_xx = diag_K_xx[:n_predict]
        test_K_features = np.asarray(
            [data[i] for i in ref_feedback_indices], dtype=float_type)
        test_K = (dist.cdist(test_K_features, test_K_features, 'cityblock')
                  / n_features + np.diag(test_K_noise))
        test_K_inv = np.linalg.inv(test_K[:n_feedback, :n_feedback])
        test_K_x = np.zeros((n_predict, n_feedback), dtype=float_type)
        for i, idx1 in enumerate(ref_predict_indices):
            for j, idx2 in enumerate(ref_feedback_indices):
                test_K_x[i][j] = (distance(data[idx1], data[idx2])
                                  / len(data[0]))
        test_K_xK = np.dot(test_K_x, test_K_inv)
        test_K_xKK_x_T = np.diag(np.dot(test_K_xK, test_K_x.T))
        test_variance = np.sqrt(np.abs(np.subtract(test_K_xx,
                                                   test_K_xKK_x_T)))
        # Flatten both sides: comparing a flat vector with an (n, 1) column
        # would broadcast to an n-by-n comparison.
        test_mean = np.dot(test_K_xK, ref_feedback).flatten()
        mean_flat = mean.flatten()[:n_predict]
        variance_flat = variance.flatten()[:n_predict]
        print(np.allclose(variance_flat, test_variance))
        print(np.allclose(mean_flat, test_mean))
        print('Variance isclose True count:',
              sum(np.isclose(variance_flat, test_variance)))
        print('Mean isclose True count:',
              sum(np.isclose(mean_flat, test_mean)))
        print('Mean differences (first 10):',
              np.subtract(mean_flat[:10], test_mean[:10]))
        print(mean_flat[:10])
        print(test_mean[:10])

    # Strip the block-size padding before handing the results back.
    mean = mean.flatten()[:n_predict]
    variance = variance.flatten()[:n_predict]
    return mean, variance