Exemplo n.º 1
0
    def _perform_sgemv(self, mat, v, vec_out, nvecs, dim):
        '''
        NOTES: cuBLAS uses Fortran layout
        cublas_sgemv is used to multiply matrix and vector (LEVEl 2 BLAS)

        cublas_handle   -> handle to the cuBLAS library context
        t               -> transpose
        dim             -> number of columns of matrix
        nvecs           -> number of rows of matrix
        alpha           -> scalar used for multiplication of mat
        mat.gpudata     -> matrix mat
        dim             -> columns of matrix
        v.gpudata       -> vector v
        incX            -> Stride within X. For example, if incX is 7, every 7th element is used.
        beta            -> scalar used for multiplication of v
        v_out.gpudata   -> result
        incY            -> Stride within Y. For example, if incx is 7, every 7th element is used

        Readmore        -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemv
        '''
        alpha = np.float32(1.0)
        beta = np.float32(0.0)

        incx = 1
        incy = 1

        cublas_handle = cublas.cublasCreate()

        cublas.cublasSgemv(cublas_handle, 't', dim, nvecs, alpha, mat.gpudata,
                           dim, v.gpudata, incx, beta, vec_out.gpudata, incy)

        cublas.cublasDestroy(cublas_handle)

        return vec_out
Exemplo n.º 2
0
    def _perform_sgemm(self, mat_a, mat_b, mat_out):
        nvecs_a, dim = mat_a.shape
        nvecs_b, dim = mat_b.shape

        alpha = np.float32(1.0)
        beta = np.float32(0.0)
        '''
        cublas_sgemm is used to  multiply matrix and matrix(LEVEL 3 BLAS)

        cublas_handle   -> handle to the cuBLAS-library context
        t               -> transpose mat_b
        n               -> notranspose mat_a
        nvecs_b         -> rows of mat_b
        nvecs_a         -> rows of mat_a
        dim             -> Common dimensions in mat_a and mat_b
        alpha           -> scaling factor for multiplication of mat_a and mat_b
        mat_b.gpudata   -> matrix mat_b
        dim             -> columns of mat_b
        mat_a.gpudata   -> matrix mat_a
        dim             -> columns of mat_a
        beta            -> scaling factor for r_gpu
        mat_out.gpudata -> matirx mat_out
        nvecs_b         -> rows of mat_b

        Read more       -> http://docs.nvidia.com/cuda/cublas/#cublas-lt-t-gt-gemm
        '''

        cublas_handle = cublas.cublasCreate()

        cublas.cublasSgemm(cublas_handle, 't', 'n', nvecs_b, nvecs_a, dim,
                           alpha, mat_b.gpudata, dim, mat_a.gpudata, dim, beta,
                           mat_out.gpudata, nvecs_b)
        cublas.cublasDestroy(cublas_handle)

        return mat_out
Exemplo n.º 3
0
def ijk_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDestroy, cublasDdot#, cublasDscal
    import skcuda.misc as misc
    #import skcuda.linalg as linalg
    #linalg.init()

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for i in range(0,n):
        for j in range(0,n):
            #N[i,j] -= N[i,:min(i,j)].dot(N[:min(i,j),j])
            N_gpu[i,j] -= cublasDdot(h, N_gpu[i,:min(i,j)].size, N_gpu[i,:min(i,j)].gpudata, 1, N_gpu[:min(i,j),j].gpudata, n)
            if j<i:
                N_gpu[i,j] /= N_gpu[j,j]
                #cublasDscal(h, N_gpu[i,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[i,j].gpudata, 1)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
Exemplo n.º 4
0
def jki_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDscal, cublasDestroy
    import skcuda.misc as misc

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for j in range(0,n):
        for k in range(0,j):

            #N[k+1:,j] = N[k+1:,j] - N[k+1:,k] * N[k,j]
            cublasDaxpy(h, N_gpu[k+1:,k].size, -np.float64(N_gpu[k,j].get()), N_gpu[k+1:,k].gpudata, n, N_gpu[k+1:,j].gpudata, n)

        #N[j+1:,j] /= N[j,j]
        cublasDscal(h, N_gpu[j+1:,j].size, 1.0/np.float64(N_gpu[j,j].get()), N_gpu[j+1:,j].gpudata, n)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
Exemplo n.º 5
0
def kij_lu_decomposer_opt_gpu(M):

    m = M.shape[0]
    n = M.shape[1]

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray

    from skcuda.cublas import cublasCreate, cublasDaxpy, cublasDestroy
    import skcuda.misc as misc

    N_gpu = gpuarray.to_gpu(M)

    h = cublasCreate()

    for k in range(0,n):
        for i in range(k+1,n):
            N_gpu[i,k] = N_gpu[i,k] / N_gpu[k,k]
            #N[i,k+1:] -= N[i,k] * N[k,k+1:]
            cublasDaxpy(h, N_gpu[k,k+1:].size, -np.float64(N_gpu[i,k].get()), N_gpu[k,k+1:].gpudata, 1, N_gpu[i,k+1:].gpudata, 1)

    #Move from GPU to CPU
    N = N_gpu.get()

    cublasDestroy(h)

    return N
Exemplo n.º 6
0
def GMRES_d(A_d, B, X0, tol, Nmax, precision):

    handler = cublas.cublasCreate()

    # Initialisation
    N = B.shape[0]
    Q = np.zeros((N, Nmax + 1))
    H = np.zeros((Nmax + 1, Nmax + 1))
    cs = np.zeros(Nmax)
    sn = np.zeros(Nmax)
    e1 = np.zeros(Nmax + 1)

    #Test preliminaire
    normB = np.linalg.norm(B)
    if normB == 0:
        print("Nombre d'iterations : 0")
        print("Erreur : ", 0)
        if precision == 2:
            return np.zeros(N, np.float64)
        else:
            return np.zeros(N, np.float32)

    # Residu initial
    Ax = product_Ax(A_d, X0, N, handler, precision)

    r0 = B - Ax
    Q[:, 0] = r0 / np.linalg.norm(r0)
    e1[0] = 1

    error = np.linalg.norm(r0) / np.linalg.norm(B)

    print("Error:", error)

    beta = np.linalg.norm(r0) * e1

    k = 0

    while (error > tol) and k < min(Nmax, N) - 1:

        Arnoldi_d(Q, A_d, H, k, handler, precision)

        apply_rotation(H, cs, sn, k)
        beta[k + 1] = -sn[k] * beta[k]
        beta[k] = cs[k] * beta[k]

        error = abs(beta[k + 1]) / np.linalg.norm(B)

        k += 1

    print("Nombre d'iterations : {}".format(k))
    print("Erreur :", error)

    Hinv = np.linalg.inv(H[:k, :k])

    y = np.matmul(Hinv[:k, :k], beta[:k])

    X = np.dot(Q[:, :k], y) + X0

    return X
Exemplo n.º 7
0
def init():
    import atexit

    def _shutdown_gpucsrarray():
        cublas.cublasDestroy(cublas_handle)
        cusparse.cusparseDestroy(cusparse_handle)

    global cublas_handle, cusparse_handle
    if cublas_handle is None or cusparse_handle is None:
        cublas_handle = cublas.cublasCreate()
        cusparse_handle = cusparse.cusparseCreate()
        atexit.register(_shutdown_gpucsrarray)
Exemplo n.º 8
0
def init():
    import atexit

    def _shutdown_gpucsrarray():
        cublas.cublasDestroy(cublas_handle)
        cusparse.cusparseDestroy(cusparse_handle)

    global cublas_handle, cusparse_handle
    if cublas_handle is None or cusparse_handle is None:
        cublas_handle = cublas.cublasCreate()
        cusparse_handle =  cusparse.cusparseCreate()
        atexit.register(_shutdown_gpucsrarray)
Exemplo n.º 9
0
def mult_BLAS():
	alpha 	= np.float64(1.0) # no prefactor
	beta 	= np.float64(0.0) # C matrix is not involved so beta = 0.0
	#m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2
	t0 = time.clock()
	for a in range(100):
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = U_x_gpu.gpudata, )
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_y_gpu.gpudata, 
							C = U_y_gpu.gpudata, )
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i_prime,
							alpha = alpha,  beta = beta, 
							A = T_gpu.gpudata, 
							B = v_z_gpu.gpudata, 
							C = U_z_gpu.gpudata, )
		'''cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= ud.i, n 	= ud.j_k, 		k = ud.i_prime,
							lda = ud.i, ldb = ud.i_prime, ldc = ud.i,
							alpha = alpha,  beta = beta, 
							A = pot_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = potential_gpu.gpudata, )'''
	print(time.clock() - t0, "mult_BLAS timer")
	return
Exemplo n.º 10
0
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
Exemplo n.º 11
0
def get_cublas_handle():
    """Gets CUBLAS handle for the current device.

    Returns:
        CUBLAS handle.

    """
    global _cublas_handles

    device = Context.get_device()
    if device in _cublas_handles:
        return _cublas_handles[device]

    handle = cublas.cublasCreate()
    _cublas_handles[device] = handle
    return handle
Exemplo n.º 12
0
def compute_gflops(precision='S'):


	if precision=='S':
		float_type = 'float32'
	elif precision=='D':
		float_type = 'float64'
	else:
		return -1
		
		
	A = np.random.randn(m, k).astype(float_type)
	B = np.random.randn(k, n).astype(float_type)
	C = np.random.randn(m, n).astype(float_type)

	A_cm = A.T.copy()
	B_cm = B.T.copy()
	C_cm = C.T.copy()

	A_gpu = gpuarray.to_gpu(A_cm)
	B_gpu = gpuarray.to_gpu(B_cm)
	C_gpu = gpuarray.to_gpu(C_cm)

	alpha = np.random.randn()
	beta = np.random.randn()

	transa = cublas._CUBLAS_OP['N']
	transb = cublas._CUBLAS_OP['N']

	lda = m
	ldb = k
	ldc = m

	t = time()
	handle = cublas.cublasCreate()

	
	exec('cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \
						B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision)
	
	cublas.cublasDestroy(handle)
	t = time() - t

	gflops = 2*m*n*(k+1)*(10**-9) / t 
	
	return gflops
Exemplo n.º 13
0
def count_triangles_cublas(adjacency_list):
    driver.init()
    context = tools.make_default_context()
    h = cublas.cublasCreate()
    n = len(adjacency_list)
    A = np.zeros([n,n], dtype=np.float64)
    for row_idx, neighbor_list in adjacency_list:
        A[row_idx, neighbor_list] = 1.0
    a_gpu = gpuarray.to_gpu(A)
    b_gpu = gpuarray.empty(A.shape, A.dtype)
    c_gpu = gpuarray.empty(A.shape, A.dtype)
    one = np.float64(1.0)
    zero = np.float64(0.0)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, a_gpu.gpudata, n, zero, b_gpu.gpudata, n)
    cublas.cublasDsymm(h, 'L', 'U', n, n, one, a_gpu.gpudata, n, b_gpu.gpudata, n, zero, c_gpu.gpudata, n)
    trace = linalg.trace(c_gpu, h)
    cublas.cublasDestroy(h)
    context.detach()
    return int(trace/6)
Exemplo n.º 14
0
def compute_gflops(precision='S'):
    if precision == 'S':
        float_type = 'float32'
    elif precision == 'D':
        float_type = 'float64'
    else:
        return -1

    # some random matrices that are of the appropriate precision that we will use for timing
    A = np.random.randn(m, k).astype(float_type)
    B = np.random.randn(k, n).astype(float_type)
    C = np.random.randn(m, n).astype(float_type)

    # wie gehabt
    A_cm = A.T.copy()
    B_cm = B.T.copy()
    C_cm = C.T.copy()
    A_gpu = gpuarray.to_gpu(A_cm)
    B_gpu = gpuarray.to_gpu(B_cm)
    C_gpu = gpuarray.to_gpu(C_cm)
    alpha = np.random.randn()
    beta = np.random.randn()
    transa = cublas._CUBLAS_OP['N']
    transb = cublas._CUBLAS_OP['N']
    lda = m
    ldb = k
    ldc = m

    t = time()
    handle = cublas.cublasCreate()

    # two different (relevant) precision modes, D and S
    exec(
        'cublas.cublas%sgemm(handle, transa, transb, m, n, k, alpha, A_gpu.gpudata, lda, \
						B_gpu.gpudata, ldb, beta, C_gpu.gpudata, ldc)' % precision)

    cublas.cublasDestroy(handle)
    t = time() - t

    # a total of 2kmn - mn + 3mn = 2kmn + 2mn = 2mn(k+1) floating point operations in a given GEMM operation
    gflops = 2 * m * n * (k + 1) * (10**-9) / t

    return gflops
Exemplo n.º 15
0
    def matmul(self, mat, return_time=False):
        '''Matrix multiplication between two matrices'''

        # check dimensions first:
        if self.ncols != mat.nrows:
            raise ValueError("Dimensions {0} and {1} do not match.".format(
                self.ncols, mat.nrows))

        # move matrices to gpu
        # somehow we need to transpose this to make it work, not sure why tho
        a_gpu = gpuarray.to_gpu(self.arr.T.copy())
        b_gpu = gpuarray.to_gpu(mat.arr.T.copy())
        c_gpu = gpuarray.to_gpu(
            np.zeros((self.nrows, mat.ncols)).astype(np.float32).T.copy())

        # initialize culas context
        h = cublasCreate()

        # evaluate the matrix multiplication
        # cubals syntax as follows:
        # h = handle, "n" : op(A) = A (for op(A) = A^t, use "t"),
        # then list m, n, k if we have m x k dot k x n
        # then first value is scalar in front of product (set to 1)
        # then give array data followed by row dim of each matrix
        # last value is for adding another matrix, set to 0.
        # also record the time it takes for the evaluation
        t0 = time.perf_counter_ns()
        cublasSgemm(h, "n", "n", self.nrows, mat.ncols, self.ncols,
                    np.float32(1.0), a_gpu.gpudata, self.nrows, b_gpu.gpudata,
                    mat.nrows, np.float32(0.0), c_gpu.gpudata, self.nrows)
        t1 = time.perf_counter_ns()

        eval_time = (t1 - t0) * (1e-9)  # time for each matmul evaluation

        # move from device to host
        prod_arr = c_gpu.get().T

        # free allocated memory for handle
        cublasDestroy(h)

        return (cublasMatrix(prod_arr),
                eval_time) if return_time else cublasMatrix(prod_arr)
Exemplo n.º 16
0
    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # no 'cusolver' library which is paid and not needed
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf
Exemplo n.º 17
0
    def __init__(self, inputs, outputs, norm=None, precision=np.float64):
        super(SLFNSkCUDA, self).__init__(inputs, outputs, norm, precision)

        # startup GPU
        #self.ctx = misc.init_context(misc.init_device(nDevice))  # NO NO NO, crashes and does not release memory
        # use CUDA_DEVICE=0 python my-script.py
        try:
            linalg.init()
        except OSError as e:
            pass  # no 'cusolver' library which is paid and not needed
            # print "error initializing scikit-cuda: %s" % e
            # print "ignore if toolbox works"

        # precision-dependent stuff
        if precision is np.float64:
            self.posv = lapack.dposv
        else:
            self.posv = lapack.sposv
            self.handle = cublas.cublasCreate()

        # prepare GPU function kernels
        kernel = """
            __global__ void dev_sigm(%s *a) {
                unsigned idx = blockDim.x * blockIdx.x + threadIdx.x;
                a[idx] = 1.0 / ( exp(a[idx]) + 1 );
            }
            """
        kernel = kernel % "double" if self.precision is np.float64 else kernel % "float"
        self.dev_sigm = SourceModule(kernel).get_function("dev_sigm")
        self.dev_sigm.prepare("P")

        # GPU transformation functions
        self.func["lin"] = self._dev_lin
        self.func["sigm"] = self._dev_sigm
        self.func["tanh"] = self._dev_tanh
        self.func["rbf_l1"] = self._dev_rbfl1
        self.func["rbf_l2"] = self._dev_rbfl2
        self.func["rbf_linf"] = self._dev_rbflinf
def cor_mat_2(BOLD, upper_tri, N, L):
	# preprocessing fMRI data in CPU
	start_time = time.time()
	BOLD = preprocessing(BOLD, N, L)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time for preprocessing: ", delta, "\n")
	
	alpha = np.float32(1.0)
	beta = np.float32(0.0)

	# passaggio su device
	start_time = time.time()
	BOLD_device = gpuarray.to_gpu(BOLD)
	result = np.zeros((BOLD.shape[0], BOLD.shape[0]), np.float32)
	result_device = gpuarray.to_gpu(result)
	# print("BOLD_device shape:", BOLD_device.shape)
	# print("result_device shape:", result_device.shape)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time matrices to device: ", delta, "\n")

	start_time = time.time()
	h = cublas.cublasCreate()
	cublas.cublasSgemm(h,
					   'T',
					   'n',
					   N,
					   N,
					   L,
					   alpha,
					   BOLD_device.gpudata,
					   L,
					   BOLD_device.gpudata,
					   L,
					   beta,
					   result_device.gpudata,
					   N)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time core function: ", delta, "\n")

	start_time = time.time()
	threads_per_block = 1024
	blocks_per_grid = int(math.ceil(1 + ((N*N - 1) / threads_per_block)))
	mod = pycuda.compiler.SourceModule("""
		__global__ void ker(float * cormat, float * upper,int n1,int n)
		{
			long idx = blockDim.x*blockIdx.x+threadIdx.x;
			long i = idx%n1;
			long j = idx/n1;
			if(i<j && i<n1 && j<n)
			{
		        long tmp=i;
		        tmp*=(i+1);
		        tmp/=2;
		        long tmp_2=i;
		        tmp_2*=n;
		        tmp_2=tmp_2-tmp;
		        tmp_2+=j;
		        tmp_2-=i;
		        upper[tmp_2-1]=cormat[j*n+i];
			}
		}
		""")
	result_device = result_device.reshape(-1)
	# print("result device shape:", result_device.shape)
	upper_tri_device = gpuarray.to_gpu(upper_tri)
	funct = mod.get_function("ker")
	funct(result_device, 
		  upper_tri_device, 
		  np.int32(N),
		  np.int32(N),    
		  block=(threads_per_block, 1, 1),
          grid=(blocks_per_grid, 1)
          )
	upper_tri = upper_tri_device.get()
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time to get upper tri: ", delta, "\n")
	
	cublas.cublasDestroy(h)
	
	return upper_tri
def cor_mat_3(BOLD, upper_tri, N, L, OOO):
	# calcolo memoria disponibile
	meminfo = cuda.mem_get_info()
	print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
	available_mem = float(meminfo[0])
	available_mem /= np.dtype(np.float32).itemsize
	available_mem -= N * L
	# print("Available memory: ", available_mem)

	# preprocessing fMRI data in CPU
	start_time = time.time()
	BOLD = preprocessing(BOLD, N, L)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time for preprocessing: ", delta, "\n")


	# passaggio di BOLD in device
	start_time = time.time()
	BOLD_device = gpuarray.to_gpu(BOLD)
	stop_time = time.time()
	delta = stop_time - start_time	
	print("Running time matrices to device: ", delta, "\n")

	# calcolo memoria disponibile
	# meminfo = cuda.mem_get_info()
	# print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

	# inizializzazione variabili
	flag = 1
	ii=0
	upper_size = (N-1) * N / 2
	block = OOO
	N_prime = N
	temp = 0
	temp2 = 0
	temp3 = 0
	pak = 0
	so_far = 0
	count = 1
	temp4 = 0

	alpha = np.float32(1.0)
	beta = np.float32(0.0)

	while flag is 1:
		print("###### ITERAZIONE ", count, " #####")
		# calcolo memoria disponibile
		# meminfo = cuda.mem_get_info()
		# print("After BOLD_device free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

		# print("block: ", block)
		# print("N_prime: ", N_prime)
		# checking for the last chunk
		if block == N_prime:
			flag = 0

		if pak is not 0:
			del dev_upper
			del result_device

		temp = block
		temp *= (block + 1)
		temp /= 2
		# M1 is the size of upper triangle part of chunk
		M1 = N_prime
		M1 *= block
		M1 -= temp

		M1 = int(M1)
		
		# print("M1: ", M1)
		
		pak += 1

		# print("so_far*L: ", so_far*L)
		start_time = time.time()
		result = np.zeros((block, N_prime), np.float32)
		# print("result shape: ", result.shape)

		BOLD_device = BOLD_device.reshape(-1)
		
		# allocate memory on the device for the result
		result_device = gpuarray.to_gpu(result)
		# print("result_device shape: ", result_device.shape)

		# # calcolo memoria disponibile
		# meminfo = cuda.mem_get_info()
		# print("Before cublasSgemm free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time matrices to device: ", delta, "\n")

		start_time = time.time()
		h = cublas.cublasCreate()
		cublas.cublasSgemm(h,
				   'T',
				   'n',
				   block,
				   N_prime,
				   L,
				   alpha,
				   BOLD_device[so_far*L:].gpudata,
				   L,
				   BOLD_device[so_far*L:].gpudata,
				   L,
				   beta,
				   result_device.gpudata,
				   block)
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time core function: ", delta, "\n")

		temp2 = block
		temp2 *= N_prime

		# calcolo memoria disponibile
		# meminfo = cuda.mem_get_info()
		# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		
		# result_device = gpuarray.to_gpu(result1)

		start_time = time.time()
		threads_per_block = 1024
		blocks_per_grid = 1 + math.ceil(((temp2-1) / threads_per_block))
		grid = (blocks_per_grid, 1)

		# print("temp2:", temp2)
		# print("threads_per_block: ", threads_per_block)
		# print("blocks_per_grid: ", blocks_per_grid)

		upper = np.zeros(M1, np.float32)
		# print("upper shape:", upper.shape)
		dev_upper = gpuarray.to_gpu(upper)

		# print("dev_upper shape: ", dev_upper.shape)
		# print("result_device shape: ", result_device.shape)

		mod = pycuda.compiler.SourceModule("""
			__global__ void ker2(float * cormat, float * upper,int n1,int n,long long upper_size,int N,int i_so_far,long long M1)
			{
				long long idx = blockDim.x;
				idx*=blockIdx.x;
				idx+=threadIdx.x;
				long i = idx/n;
				long j = idx%n;

				if(i<j && i<n1 && j<n)// &&i<N &&j<N && idx<(n1*n))
				{
				        long long tmp=i;
				        tmp*=(i+1);
				        tmp/=2;
				        long long tmp_2=i;
				        tmp_2*=n;
				        tmp_2=tmp_2-tmp;
				        tmp_2+=j;
				        tmp_2-=i;
				        long long indexi=n1;
				        indexi*=j;
				        indexi=indexi+i;
				        upper[tmp_2-1]=cormat[indexi];
				}
			}
  			""")
		# calcolo memoria disponibile
		# meminfo = cuda.mem_get_info()
		# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))
		
		funct = mod.get_function("ker2")
		funct(result_device, 
			  dev_upper, 
			  np.int32(block), 
			  np.int32(N_prime),
			  np.int64(upper_size),
			  np.int32(N),
			  np.int32(ii), 
			  np.int64(M1),       
			  block=(threads_per_block, 1, 1),
              grid=grid
              )

		temp3+=M1
		# print("upper_tri shape:", upper_tri.shape)
		upper_tri[temp4:temp3] = dev_upper.get()
		stop_time = time.time()
		delta = stop_time - start_time	
		print("Running time to get upper tri: ", delta, "\n")

		temp4 += M1
		ii += block

		cublas.cublasDestroy(h)

		so_far += block

		if N_prime > block:
			N_prime = N_prime - block
			block = remaining_N2(N_prime, L, available_mem)
			if N_prime < block:
				block = N_prime

		count += 1

	# liberare la memoria 
	del BOLD_device
	del result_device
	del dev_upper

	# calcolo memoria disponibile
	# meminfo = cuda.mem_get_info()
	# print("free: %s bytes, total, %s bytes" % (meminfo[0], meminfo[1]))

	return upper_tri
Exemplo n.º 20
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
     cls.cublas_handle = cublas.cublasCreate()
def register_multiple_images_subpix_cuda(stack, template):

    import pycuda.autoinit
    import pycuda.gpuarray as gpuarray
    import pycuda.driver as drv
    import pycuda.cumath as cumath
    import skcuda.fft as cu_fft
    import skcuda.linalg as lin
    import skcuda.cublas as cub
    from numpy import pi, newaxis, floor
    import cmath
    from pycuda.elementwise import ElementwiseKernel
    from pycuda.compiler import SourceModule

    from numpy import conj, abs, arctan2, sqrt, real, imag, shape, zeros, trunc, ceil, floor, fix
    from numpy.fft import fftshift, ifftshift
    fft2, ifft2 = fftn, ifftn = fast_ffts.get_ffts(nthreads=1,
                                                   use_numpy_fft=False)

    mod = SourceModule("""
   #include <pycuda-complex.hpp>"
   
    __global__ void load_convert(unsigned short *a, float *b,int f, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = f * imlen;
        if (idx <imlen)
        {
            b[idx] = (float)a[offset+idx];
        }
    }
        
    __global__ void convert_export(float *a, unsigned short *b,int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            b[idx] = (unsigned short)(a[idx]>0 ? a[idx] : 0) ;
        }
    }
        
    __global__ void multiply_comp_float(pycuda::complex<float> *x, pycuda::complex<float> *y, pycuda::complex<float> *z, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx] = x[idx] * y[idx];
        }
    }
        
    __global__ void calc_conj(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = x[idx]._M_re;
            y[idx]._M_im = -x[idx]._M_im;
        }
    }
        
        
    __global__ void convert_multiply(float *x, pycuda::complex<float> *y, float sx, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            y[idx]._M_re = 0;
            y[idx]._M_im = x[idx] * sx;
        }
    }
        
    __global__ void transfer_array(pycuda::complex<float> *x, pycuda::complex<float> *y, int imlenl, int imlen,  int nlargeh, int nh)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        int offset = imlenl*3/4;
        if (idx<imlen)
        {
            int target_ind = (offset+(idx/nh)*nlargeh + (idx % nh))%imlenl;
            x[target_ind] = y[idx];
        }      
    
    }    
        
    __global__ void calc_shiftmatrix(float *x, float *y, pycuda::complex<float> *z, float sx, float sy,float dg, int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            z[idx]._M_re = 0;
            z[idx]._M_im = x[idx] * sx + y[idx] * sy + dg;
        }
    }
        
    __global__ void sub_float(float *x, float *y, float sv,  int imlen)
    {
        int idx = (int) gridDim.x*blockDim.x*blockIdx.y+blockIdx.x *  blockDim.x +  threadIdx.x ;
        if (idx <imlen)
        {
            x[idx] = y[idx]-sv;
        }
    }
        

    """)

    load_convert_kernel = mod.get_function('load_convert')
    convert_export_kernel = mod.get_function('convert_export')
    convert_multiply_kernel = mod.get_function('convert_multiply')
    multiply_float_kernel = mod.get_function('multiply_comp_float')
    transfer_array_kernel = mod.get_function('transfer_array')
    calc_shiftmatrix_kernel = mod.get_function('calc_shiftmatrix')
    conj_kernel = mod.get_function('calc_conj')
    sub_float_kernel = mod.get_function('sub_float')

    Z = stack.shape[0]
    M = stack.shape[1]
    N = stack.shape[2]
    max_memsize = 4200000000

    imlen = M * N
    half_imlen = M * (N // 2 + 1)
    grid_dim = (64, int(imlen / (512 * 64)) + 1, 1)
    block_dim = (512, 1, 1)  #512 threads per block

    stack_bin = int(max_memsize / (M * N * stack.itemsize))
    stack_ite = int(Z / stack_bin) + 1

    usfac = 100  ## needs to be bigger than 10

    if not template.shape == stack.shape[1:]:
        raise ValueError("Images must have same shape.")

    if np.any(np.isnan(template)):
        template = template.copy()
        template[template != template] = 0
    if np.any(np.isnan(stack)):
        stack = stack.copy()
        stack[stack != stack] = 0

    mlarge = M * 2
    nlarge = N * 2

    t = time.time()

    plan_forward = cu_fft.Plan((M, N), np.float32, np.complex64)
    plan_inverse = cu_fft.Plan((M, N), np.complex64, np.float32)
    plan_inverse_big = cu_fft.Plan((mlarge, nlarge), np.complex64, np.float32)
    cub_h = cub.cublasCreate()

    template_gpu = gpuarray.to_gpu(template.astype('float32'))
    source_gpu = gpuarray.empty((M, N), np.float32)
    ifft_gpu = gpuarray.empty((M, N), np.float32)
    result_gpu = gpuarray.empty((M, N), np.uint16)

    templatef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    sourcef_gpu = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu1 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    prod_gpu2 = gpuarray.empty((M, N // 2 + 1), np.complex64)
    shiftmatrix = gpuarray.empty((M, N // 2 + 1), np.complex64)

    cu_fft.fft(template_gpu, templatef_gpu, plan_forward, scale=True)
    templatef_gpu = templatef_gpu.conj()

    move_list = np.zeros((Z, 2))

    largearray1_gpu = gpuarray.zeros((mlarge, nlarge // 2 + 1), np.complex64)
    largearray2_gpu = gpuarray.empty((mlarge, nlarge), np.float32)
    imlenl = mlarge * (nlarge // 2 + 1)

    zoom_factor = 1.5
    dftshiftG = trunc(ceil(usfac * zoom_factor) / 2)
    #% Center of output array at dftshift+1
    upsample_dim = int(ceil(usfac * zoom_factor))

    term1c = (ifftshift(np.arange(N, dtype='float') - floor(N / 2)).
              T[:, newaxis]) / N  # fftfreq  # output points
    term2c = ((np.arange(upsample_dim, dtype='float')) / usfac)[newaxis, :]
    term1r = (np.arange(upsample_dim, dtype='float').T)[:, newaxis]
    term2r = (ifftshift(np.arange(M, dtype='float')) -
              floor(M / 2))[newaxis, :]  # fftfreq
    term1c_gpu = gpuarray.to_gpu(term1c[:int(floor(N / 2) +
                                             1), :].astype('float32'))
    term2c_gpu = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu = gpuarray.to_gpu(term1r.astype('float32'))
    term2r_gpu = gpuarray.to_gpu(term2r.astype('float32'))
    term2c_gpu_ori = gpuarray.to_gpu(term2c.astype('float32'))
    term1r_gpu_ori = gpuarray.to_gpu(term1r.astype('float32'))

    kernc_gpu = gpuarray.zeros((N // 2 + 1, upsample_dim), np.float32)
    kernr_gpu = gpuarray.zeros((upsample_dim, M), np.float32)
    kernc_gpuc = gpuarray.zeros((N // 2 + 1, upsample_dim), np.complex64)
    kernr_gpuc = gpuarray.zeros((upsample_dim, M), np.complex64)

    Nr = np.fft.ifftshift(np.linspace(-np.fix(M / 2), np.ceil(M / 2) - 1, M))
    Nc = np.fft.ifftshift(np.linspace(-np.fix(N / 2), np.ceil(N / 2) - 1, N))
    [Nc, Nr] = np.meshgrid(Nc, Nr)
    Nc_gpu = gpuarray.to_gpu((Nc[:, :N // 2 + 1] / N).astype('float32'))
    Nr_gpu = gpuarray.to_gpu((Nr[:, :N // 2 + 1] / M).astype('float32'))

    upsampled1 = gpuarray.empty((upsample_dim, N // 2 + 1), np.complex64)
    upsampled2 = gpuarray.empty((upsample_dim, upsample_dim), np.complex64)

    source_stack = gpuarray.empty((stack_bin, M, N), dtype=stack.dtype)
    copy = drv.Memcpy3D()
    copy.set_src_host(stack.data)
    copy.set_dst_device(source_stack.gpudata)
    copy.width_in_bytes = copy.src_pitch = stack.strides[1]
    copy.src_height = copy.height = M

    for zb in range(stack_ite):

        zrange = np.arange(zb * stack_bin, min((stack_bin * (zb + 1)), Z))
        copy.depth = len(zrange)
        copy.src_z = int(zrange[0])
        copy()

        for i in range(len(zrange)):

            t = zb * stack_bin + i
            load_convert_kernel(source_stack,
                                source_gpu.gpudata,
                                np.int32(i),
                                np.int32(imlen),
                                block=block_dim,
                                grid=grid_dim)
            cu_fft.fft(source_gpu, sourcef_gpu, plan_forward, scale=True)

            multiply_float_kernel(sourcef_gpu,
                                  templatef_gpu,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            transfer_array_kernel(largearray1_gpu,
                                  prod_gpu1,
                                  np.int32(imlenl),
                                  np.int32(half_imlen),
                                  np.int32(nlarge // 2 + 1),
                                  np.int32(N // 2 + 1),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(largearray1_gpu,
                        largearray2_gpu,
                        plan_inverse_big,
                        scale=True)
            peakind = cub.cublasIsamax(cub_h, largearray2_gpu.size,
                                       largearray2_gpu.gpudata, 1)
            rloc, cloc = np.unravel_index(peakind, largearray2_gpu.shape)

            md2 = trunc(mlarge / 2)
            nd2 = trunc(nlarge / 2)
            if rloc > md2:
                row_shift2 = rloc - mlarge
            else:
                row_shift2 = rloc
            if cloc > nd2:
                col_shift2 = cloc - nlarge
            else:
                col_shift2 = cloc
            row_shiftG = row_shift2 / 2.
            col_shiftG = col_shift2 / 2.

            # Initial shift estimate in upsampled grid

            row_shiftG0 = round(row_shiftG * usfac) / usfac
            col_shiftG0 = round(col_shiftG * usfac) / usfac
            # Matrix multiply DFT around the current shift estimate
            roffG = dftshiftG - row_shiftG0 * usfac
            coffG = dftshiftG - col_shiftG0 * usfac

            sub_float_kernel(term2c_gpu,
                             term2c_gpu_ori,
                             np.float32(coffG / usfac),
                             np.int32(term2c_gpu.size),
                             block=block_dim,
                             grid=grid_dim)
            sub_float_kernel(term1r_gpu,
                             term1r_gpu_ori,
                             np.float32(roffG),
                             np.int32(term1r_gpu.size),
                             block=block_dim,
                             grid=grid_dim)

            lin.dot(term1c_gpu, term2c_gpu, handle=cub_h, out=kernc_gpu)
            lin.dot(term1r_gpu, term2r_gpu, handle=cub_h, out=kernr_gpu)
            convert_multiply_kernel(kernc_gpu,
                                    kernc_gpuc,
                                    np.float32(-2 * pi),
                                    np.int32(kernc_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            convert_multiply_kernel(kernr_gpu,
                                    kernr_gpuc,
                                    np.float32(-2 * pi / (M * usfac)),
                                    np.int32(kernr_gpu.size),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(kernc_gpuc, out=kernc_gpuc)
            cumath.exp(kernr_gpuc, out=kernr_gpuc)

            conj_kernel(prod_gpu1,
                        prod_gpu2,
                        np.int32(half_imlen),
                        block=block_dim,
                        grid=grid_dim)

            lin.dot(kernr_gpuc, prod_gpu2, handle=cub_h, out=upsampled1)
            lin.dot(upsampled1, kernc_gpuc, handle=cub_h, out=upsampled2)

            CCG = conj(upsampled2.get()) / (md2 * nd2 * usfac**2)
            rlocG, clocG = np.unravel_index(abs(CCG).argmax(), CCG.shape)
            CCGmax = CCG[rlocG, clocG]

            rlocG = rlocG - dftshiftG  #+ 1 # +1 # questionable/failed hack + 1;
            clocG = clocG - dftshiftG  #+ 1 # -1 # questionable/failed hack - 1;
            row_shiftG = row_shiftG0 + rlocG / usfac
            col_shiftG = col_shiftG0 + clocG / usfac

            diffphaseG = arctan2(imag(CCGmax), real(CCGmax))

            # Compute registered version of source stack
            calc_shiftmatrix_kernel(Nr_gpu,
                                    Nc_gpu,
                                    shiftmatrix,
                                    np.float32(row_shiftG * 2 * np.pi),
                                    np.float32(col_shiftG * 2 * np.pi),
                                    np.float32(diffphaseG),
                                    np.int32(half_imlen),
                                    block=block_dim,
                                    grid=grid_dim)
            cumath.exp(shiftmatrix, out=shiftmatrix)
            multiply_float_kernel(sourcef_gpu,
                                  shiftmatrix,
                                  prod_gpu1,
                                  np.int32(half_imlen),
                                  block=block_dim,
                                  grid=grid_dim)
            cu_fft.ifft(prod_gpu1, ifft_gpu, plan_inverse)
            convert_export_kernel(ifft_gpu,
                                  result_gpu,
                                  np.int32(imlen),
                                  block=block_dim,
                                  grid=grid_dim)

            move_list[t, :] = (row_shiftG, col_shiftG)
            stack[t, :, :] = result_gpu.get()

    cub.cublasDestroy(cub_h)
    return (stack, move_list)
Exemplo n.º 22
0
def attach_cublas_handle_to_context(ctx):
    handle = getattr(ctx, "cublas_handle", None)
    if handle is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()
Exemplo n.º 23
0
 def setUpClass(cls):
     cls.ctx = make_default_context()
     cls.cublas_handle = cublas.cublasCreate()
Exemplo n.º 24
0
 def create(self):
     if self.handle is None:
         self.handle = cublas.cublasCreate()
Exemplo n.º 25
0
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np

from skcuda import cublas

a = np.float32(10)
x = np.float32([1, 2, 3])

y = np.float32([-.345, 8.15, -15.867])

x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

cublas_context_h = cublas.cublasCreate()

cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1, y_gpu.gpudata, 1)

cublas.cublasDestroy(cublas_context_h)

print("This is close to the NumPy approximation: {}".format(np.allclose(a * x + y, y_gpu.get())))
Exemplo n.º 26
0
b_g = gpuarray.to_gpu(b_c)
# allocating a 2x2 matrix filled with zeros:
c_g = gpuarray.zeros((2,2), dtype = np.float32)

# printing the process(GPU):
print('GPU matrix-matrix multiplication:\n')
print('a_g = \n', a_g.get(), '\n')
print('b_g = \n', b_g.get(), '\n')

# storing matrix-matrix product of (a_g) and (b_g) in (c_g):
start_gpu = drv.Event()
end_gpu = drv.Event()

start_gpu.record()

cublas.cublasSgemm(cublas.cublasCreate(),
			transa = 'n',
			transb = 't',
			m = 2, n = 2, k = 2,
			alpha = np.float32(1.0),
			A = a_g.gpudata, lda = 2,
			B = b_g.gpudata, ldb = 2,
			beta = np.float32(0.0), C = c_g.gpudata, ldc = 2)


end_gpu.record()
end_gpu.synchronize()
gpu_time = end_gpu.time_till(start_gpu)

# printing GPU time and product:
print('GPU time:', gpu_time, '\n')
Exemplo n.º 27
0
    def __init__(self,
                 input_shape=None,
                 i_gpu=_DEFAULT_GPU,
                 cudaDevice=None,
                 cudaContext=None,
                 cudnnContext=None,
                 cublasContext=None,
                 verbose=None,
                 maxiter=None,
                 keeptime=None,
                 keepvars=None,
                 stepsize=None,
                 parameters=None):
        """ Initialize contexts if needed """

        self.i_gpu = i_gpu
        self.floatX = _DEFAULT_FLOATX
        self.verbose = verbose
        self.maxiter = maxiter
        self.keeptime = keeptime
        self.keepvars = keepvars
        self.stepsize = stepsize
        self.input_shape = input_shape
        self.cudaDevice = cudaDevice
        self.cudaContext = cudaContext
        self.cudnnContext = cudnnContext
        self.cublasContext = cublasContext
        ############################################################
        # define circuit parameters as a constant named tuple rather
        # than a dict to avoid inappropriate modification by user
        ############################################################
        try:
            for pkey, pval in parameters.iteritems():
                _DEFAULT_PARAMETERS_TEMPLATE[pkey] = pval
        except AttributeError:
            pass
        finally:
            self.parameters = CircuitParameters(**_DEFAULT_PARAMETERS_TEMPLATE)

        if self.verbose is None:
            self.verbose = _DEFAULT_VERBOSE
        if self.maxiter is None:
            self.maxiter = _DEFAULT_MAXITER
        if self.keeptime is None:
            self.keeptime = _DEFAULT_KEEPTIME
        if self.keepvars is None:
            self.keepvars = _DEFAULT_KEEPVARS
        if self.stepsize is None:
            self.stepsize = _DEFAULT_STEPSIZE

        if self.input_shape is not None:
            self._sanity_check()

        if self.cudaDevice is None:
            cuda_driver.init()
            self.cudaDevice = cuda_driver.Device(self.i_gpu)
        if self.cudaContext is None:
            self.cudaContext = self.cudaDevice.make_context()
        if self.cudnnContext is None:
            self.cudnnContext = cudnn.cudnnCreate()
        if self.cublasContext is None:
            self.cublasContext = cublas.cublasCreate()

        # if input shape is known, initialize now
        if self.input_shape is not None:
            self._prepare_kernels()
            self._prepare_tensors()
Exemplo n.º 28
0
 def create(self):
     if self.handle is None:
         self.handle = cublas.cublasCreate()
Exemplo n.º 29
0
def omnical(ggu_indices,
            gains,
            ubls,
            data,
            wgts,
            conv_crit=1e-10,
            maxiter=50,
            check_every=4,
            check_after=1,
            gain=0.3,
            nthreads=NTHREADS,
            precision=1,
            verbose=False):
    '''CPU-side function for organizing GPU-acceleration primitives into
    a cohesive omnical algorithm.
        
    Args:
        ggu_indices: (nbls,3) array of (i,j,k) indices denoting data order
                     as gains[i] * gains[j].conj() * ubl[k]
        gains: (ndata, nants) array of estimated complex gains
        ubls: (ndata, nubls) array of estimated complex unique baselines
        data: (ndata, nbls) array of data to be calibrated
        wgts: (ndata, nbls) array of weights for each data
        conv_crit: maximum allowed relative change in solutions to be 
            considered converged
        maxiter: maximum number of omnical iterations allowed before it
            gives up.
        check_every: Compute convergence every Nth iteration (saves 
            computation).  Default 4.
        check_after: Start computing convergence only after N iterations.  
            Default 1.
        gain: The fractional step made toward the new solution each 
            iteration. Values in the range 0.1 to 0.5 are generally safe.  
            Increasing values trade speed for stability.  Default is 0.3.
        nthreads: Number of GPU threads to use. Default NTHREADS=1024
        precision: 1=float32, 2=double (float64). Default 1.
        verbose: If True, print things. Default False.

    Returns:
        info dictionary of {'gains':gains, 'ubls':ubls, 'chisq':chisq, 
            'iters':iters, 'conv':conv}
    '''
    # Sanity check input array dimensions
    nbls = ggu_indices.shape[0]
    assert ggu_indices.shape == (nbls, 3)
    ndata = data.shape[0]
    assert data.shape == (ndata, nbls)
    assert wgts.shape == (ndata, nbls)
    nants = gains.shape[1]
    assert gains.shape == (ndata, nants)
    nubls = ubls.shape[1]
    assert ubls.shape == (ndata, nubls)
    assert precision in (1, 2)
    assert check_every > 1
    if verbose:
        print('PRECISION:', precision)
        print('NDATA:', ndata)
        print('NANTS:', nants)
        print('NUBLS:', nubls)
        print('NBLS:', nbls)

    # Choose between float/double primitives
    if precision == 1:
        real_dtype, complex_dtype = np.float32, np.complex64
        DTYPE, CDTYPE = 'float', 'cuFloatComplex'
        CMULT, CONJ, CSUB = 'cuCmulf', 'cuConjf', 'cuCsubf'
        COPY = cublasCcopy
    else:
        real_dtype, complex_dtype = np.float64, np.complex128
        DTYPE, CDTYPE = 'double', 'cuDoubleComplex'
        CMULT, CONJ, CSUB = 'cuCmul', 'cuConj', 'cuCsub'
        COPY = cublasZcopy

    # ensure data types
    ggu_indices = ggu_indices.astype(np.uint32)
    data = data.astype(complex_dtype)
    wgts = wgts.astype(real_dtype)
    gains = gains.astype(complex_dtype)
    ubls = ubls.astype(complex_dtype)

    # Profiling info used to determine optimal chunk size:
    # Model: 15.3 s + ndata/140 => px=8192 in 75 s on Quadro T2000
    # px=2000, nants=126, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 8.5 s
    #chunk_size = min(nthreads // 4, ndata) # 6.7 s
    #chunk_size = min(nthreads // 8, ndata) # 5.6 s
    #chunk_size = min(nthreads // 16, ndata) # 5.2 s
    #chunk_size = min(nthreads // 32, ndata) # 5.4 s
    #chunk_size = min(nthreads // 64, ndata) # 5.9 s
    # px=2000, nants=54, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 1.5 s
    #chunk_size = min(nthreads // 4, ndata) # 1.3 s
    #chunk_size = min(nthreads // 8, ndata) # 1.3 s
    #chunk_size = min(nthreads // 16, ndata) # 1.5 s
    #chunk_size = min(nthreads // 32, ndata) # 1.9 s
    #chunk_size = min(nthreads // 64, ndata) # 3.0 s
    # px=2000, nants=180, precision=2
    #chunk_size = min(nthreads // 2, ndata) # 16.8 s
    #chunk_size = min(nthreads // 4, ndata) # 13.8 s
    #chunk_size = min(nthreads // 8, ndata) # 11.1 s
    #chunk_size = min(nthreads // 16, ndata) # 9.9 s
    #chunk_size = min(nthreads // 32, ndata) # 10.0 s
    #chunk_size = min(nthreads // 64, ndata) # 10.14 s
    # px=2000, nants=180, precision=1
    #chunk_size = min(nthreads // 2, ndata) # 7.6 s
    #chunk_size = min(nthreads // 4, ndata) # 8.3 s
    #chunk_size = min(nthreads // 8, ndata) # 7.4 s
    #chunk_size = min(nthreads // 16, ndata) # 7.3 s
    #chunk_size = min(nthreads // 32, ndata) # 7.4 s
    #chunk_size = min(nthreads // 64, ndata) # 7.5 s
    # Upshot: nthreads // 16 seems to cover most cases, no need to tune.

    # Build the CUDA code
    gpu_code = GPU_TEMPLATE.format(
        **{
            'NBLS': nbls,
            'NUBLS': nubls,
            'NANTS': nants,
            'GAIN': gain,
            'CMULT': CMULT,
            'CONJ': CONJ,
            'CSUB': CSUB,
            'DTYPE': DTYPE,
            'CDTYPE': CDTYPE,
        })
    # Extract functions from CUDA, suffix _cuda indicates GPU operation
    gpu_module = compiler.SourceModule(gpu_code)
    gen_dmdl_cuda = gpu_module.get_function("gen_dmdl")
    calc_chisq_cuda = gpu_module.get_function("calc_chisq")
    calc_dwgts_cuda = gpu_module.get_function("calc_dwgts")
    calc_gu_wgt_cuda = gpu_module.get_function("calc_gu_wgt")
    calc_gu_buf_cuda = gpu_module.get_function("calc_gu_buf")
    clear_complex_cuda = gpu_module.get_function("clear_complex")
    set_val_real_cuda = gpu_module.get_function("set_val_real")
    set_val_uint_cuda = gpu_module.get_function("set_val_uint")
    update_gains_cuda = gpu_module.get_function("update_gains")
    update_ubls_cuda = gpu_module.get_function("update_ubls")
    calc_conv_cuda = gpu_module.get_function("calc_conv")
    update_active_cuda = gpu_module.get_function("update_active")

    h = cublasCreate()  # handle for managing cublas, used for buffer copies

    # define GPU buffers, suffix _g indicates GPU buffer
    chunk_size = min(nthreads // 16, ndata)
    data_chunks = 1
    ANT_SHAPE = (chunk_size, nants)
    UBL_SHAPE = (chunk_size, nubls)
    BLS_SHAPE = (chunk_size, nbls)
    block = (chunk_size, int(np.floor(nthreads / chunk_size)), 1)
    ant_grid = (data_chunks, int(np.ceil(nants / block[1])))
    ubl_grid = (data_chunks, int(np.ceil(nubls / block[1])))
    bls_grid = (data_chunks, int(np.ceil(nbls / block[1])))
    conv_grid = (data_chunks, int(np.ceil(max(nants, nubls) / block[1])))
    if verbose:
        print('GPU block:', block)
        print('ANT grid:', ant_grid)
        print('UBL grid:', ubl_grid)
        print('BLS grid:', bls_grid)

    ggu_indices_g = gpuarray.empty(shape=ggu_indices.shape, dtype=np.uint32)
    active_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)
    iters_g = gpuarray.empty(shape=(chunk_size, ), dtype=np.uint32)

    gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    new_gains_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gbuf_g = gpuarray.empty(shape=ANT_SHAPE, dtype=complex_dtype)
    gwgt_g = gpuarray.empty(shape=ANT_SHAPE, dtype=real_dtype)

    ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    new_ubls_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    ubuf_g = gpuarray.empty(shape=UBL_SHAPE, dtype=complex_dtype)
    uwgt_g = gpuarray.empty(shape=UBL_SHAPE, dtype=real_dtype)

    data_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    dmdl_g = gpuarray.empty(shape=BLS_SHAPE, dtype=complex_dtype)
    wgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)
    dwgts_g = gpuarray.empty(shape=BLS_SHAPE, dtype=real_dtype)

    chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    new_chisq_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_sum_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_wgt_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)
    conv_g = gpuarray.empty(shape=(chunk_size, ), dtype=real_dtype)

    # Define return buffers
    chisq = np.empty((ndata, ), dtype=real_dtype)
    conv = np.empty((ndata, ), dtype=real_dtype)
    iters = np.empty((ndata, ), dtype=np.uint32)
    active = np.empty((chunk_size, ), dtype=np.uint32)

    # upload data indices
    ggu_indices_g.set_async(ggu_indices)

    # initialize structures used to time code
    event_order = ('start', 'upload', 'dmdl', 'calc_chisq', 'loop_top',
                   'calc_gu_wgt', 'calc_gu_buf', 'copy_gains', 'copy_ubls',
                   'update_gains', 'update_ubls', 'dmdl2', 'chisq2',
                   'calc_conv', 'update_active', 'get_active', 'end')
    event_pairs = list((event_order[i], event_order[i + 1])
                       for i in range(len(event_order[:-1])))
    cum_time = {}
    t0 = time.time()

    # Loop over chunks of parallel omnical problems
    for px in range(0, ndata, chunk_size):
        events = {e: driver.Event() for e in event_order}
        events['start'].record()

        end = min(ndata, px + chunk_size)
        beg = end - chunk_size
        offset = px - beg
        gains_g.set_async(gains[beg:end])
        ubls_g.set_async(ubls[beg:end])
        data_g.set_async(data[beg:end])
        wgts_g.set_async(wgts[beg:end])
        active = np.ones((chunk_size, ), dtype=np.uint32)
        if offset > 0:
            active[:offset] = 0
        active_g.set_async(active)
        set_val_real_cuda(conv_sum_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        set_val_real_cuda(conv_wgt_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        events['upload'].record()

        gen_dmdl_cuda(ggu_indices_g,
                      gains_g,
                      ubls_g,
                      dmdl_g,
                      active_g,
                      block=block,
                      grid=bls_grid)
        events['dmdl'].record()

        set_val_real_cuda(chisq_g,
                          np.uint32(chunk_size),
                          real_dtype(0),
                          block=(chunk_size, 1, 1),
                          grid=(1, 1))
        calc_chisq_cuda(data_g,
                        dmdl_g,
                        wgts_g,
                        chisq_g,
                        active_g,
                        block=block,
                        grid=bls_grid)
        events['calc_chisq'].record()

        if TIME_IT:
            events['calc_chisq'].synchronize()
            for (e1, e2) in event_pairs[:3]:
                cum_time[(e1,e2)] = cum_time.get((e1,e2), 0) + \
                                    events[e2].time_since(events[e1])

        # Loop over iterations within an omnical problem
        for i in range(1, maxiter + 1):
            events['loop_top'].record()

            if (i % check_every) == 1:
                # Per standard omnical algorithm, only update gwgt/uwgt
                # every few iterations to save compute.
                calc_dwgts_cuda(dmdl_g,
                                wgts_g,
                                dwgts_g,
                                active_g,
                                block=block,
                                grid=bls_grid)
                set_val_real_cuda(
                    gwgt_g,
                    np.uint32(nants * chunk_size),
                    real_dtype(0),
                    block=(nthreads, 1, 1),
                    grid=(int(np.ceil(nants * chunk_size / nthreads)), 1))
                set_val_real_cuda(
                    uwgt_g,
                    np.uint32(nubls * chunk_size),
                    real_dtype(0),
                    block=(nthreads, 1, 1),
                    grid=(int(np.ceil(nubls * chunk_size / nthreads)), 1))
                calc_gu_wgt_cuda(ggu_indices_g,
                                 dmdl_g,
                                 dwgts_g,
                                 gwgt_g,
                                 uwgt_g,
                                 active_g,
                                 block=block,
                                 grid=bls_grid)
            events['calc_gu_wgt'].record()

            clear_complex_cuda(gbuf_g,
                               np.uint32(nants * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nants * chunk_size /
                                                 nthreads)), 1))
            clear_complex_cuda(ubuf_g,
                               np.uint32(nubls * chunk_size),
                               block=(nthreads, 1, 1),
                               grid=(int(np.ceil(nubls * chunk_size /
                                                 nthreads)), 1))
            # This is 75% of the compute load
            calc_gu_buf_cuda(ggu_indices_g,
                             data_g,
                             dwgts_g,
                             dmdl_g,
                             gbuf_g,
                             ubuf_g,
                             active_g,
                             block=block,
                             grid=bls_grid)
            events['calc_gu_buf'].record()

            if (i < maxiter) and (i < check_after or (i % check_every != 0)):
                # Fast branch: don't check convergence/divergence
                events['copy_gains'].record()
                events['copy_ubls'].record()

                update_gains_cuda(gbuf_g,
                                  gwgt_g,
                                  gains_g,
                                  np.float32(gain),
                                  active_g,
                                  block=block,
                                  grid=ant_grid)
                events['update_gains'].record()

                update_ubls_cuda(ubuf_g,
                                 uwgt_g,
                                 ubls_g,
                                 np.float32(gain),
                                 active_g,
                                 block=block,
                                 grid=ubl_grid)
                events['update_ubls'].record()

                gen_dmdl_cuda(ggu_indices_g,
                              gains_g,
                              ubls_g,
                              dmdl_g,
                              active_g,
                              block=block,
                              grid=bls_grid)
                events['dmdl2'].record()

                events['chisq2'].record()
                events['calc_conv'].record()
                events['update_active'].record()
                events['get_active'].record()

            else:
                # Slow branch: check convergence/divergence
                COPY(h, nants * chunk_size, gains_g.gpudata, 1,
                     new_gains_g.gpudata, 1)
                events['copy_gains'].record()

                COPY(h, nubls * chunk_size, ubls_g.gpudata, 1,
                     new_ubls_g.gpudata, 1)
                events['copy_ubls'].record()

                update_gains_cuda(gbuf_g,
                                  gwgt_g,
                                  new_gains_g,
                                  np.float32(gain),
                                  active_g,
                                  block=block,
                                  grid=ant_grid)
                events['update_gains'].record()

                update_ubls_cuda(ubuf_g,
                                 uwgt_g,
                                 new_ubls_g,
                                 np.float32(gain),
                                 active_g,
                                 block=block,
                                 grid=ubl_grid)
                events['update_ubls'].record()

                gen_dmdl_cuda(ggu_indices_g,
                              new_gains_g,
                              new_ubls_g,
                              dmdl_g,
                              active_g,
                              block=block,
                              grid=bls_grid)
                events['dmdl2'].record()

                set_val_real_cuda(new_chisq_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_chisq_cuda(data_g,
                                dmdl_g,
                                wgts_g,
                                new_chisq_g,
                                active_g,
                                block=block,
                                grid=bls_grid)
                events['chisq2'].record()

                set_val_real_cuda(conv_sum_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                set_val_real_cuda(conv_wgt_g,
                                  np.uint32(chunk_size),
                                  real_dtype(0),
                                  block=(chunk_size, 1, 1),
                                  grid=(1, 1))
                calc_conv_cuda(new_gains_g,
                               gains_g,
                               new_ubls_g,
                               ubls_g,
                               conv_sum_g,
                               conv_wgt_g,
                               active_g,
                               block=block,
                               grid=conv_grid)
                events['calc_conv'].record()

                update_active_cuda(new_gains_g,
                                   gains_g,
                                   new_ubls_g,
                                   ubls_g,
                                   conv_sum_g,
                                   conv_wgt_g,
                                   conv_g,
                                   real_dtype(conv_crit),
                                   new_chisq_g,
                                   chisq_g,
                                   iters_g,
                                   np.uint32(i),
                                   active_g,
                                   block=block,
                                   grid=conv_grid)
                events['update_active'].record()

                active_g.get_async(ary=active)
                events['get_active'].record()

                if not np.any(active):
                    break
            events['end'].record()

            if TIME_IT:
                events['end'].synchronize()
                for (e1, e2) in event_pairs[4:]:
                    cum_time[(e1,e2)] = cum_time.get((e1,e2), 0) + \
                                        events[e2].time_since(events[e1])

        # Download final answers into buffers returned to user
        # use offset to trim off parts of chunk that were never active
        _chisq = np.empty((chunk_size, ), dtype=real_dtype)
        chisq_g.get_async(ary=_chisq)
        chisq[px:end] = _chisq[offset:]
        _iters = np.empty((chunk_size, ), dtype=np.uint32)
        iters_g.get_async(ary=_iters)
        iters[px:end] = _iters[offset:]
        _conv = np.empty((chunk_size, ), dtype=real_dtype)
        conv_g.get_async(ary=_conv)
        conv[px:end] = _conv[offset:]
        _gains = np.empty(ANT_SHAPE, dtype=complex_dtype)
        gains_g.get_async(ary=_gains)
        gains[px:end, :] = _gains[offset:, :]
        _ubls = np.empty(UBL_SHAPE, dtype=complex_dtype)
        ubls_g.get_async(ary=_ubls)
        ubls[px:end, :] = _ubls[offset:, :]

    t1 = time.time()
    if TIME_IT:
        print('Final, nthreads=%d' % nthreads)
        for (e1, e2) in event_pairs:
            try:
                print('%6.3f' % cum_time[(e1, e2)], e1, e2)
            except (KeyError):
                pass
        print(t1 - t0)
        print()

    cublasDestroy(h)  # teardown GPU configuration

    return {
        'gains': gains,
        'ubls': ubls,
        'chisq': chisq,
        'iters': iters,
        'conv': conv
    }
Exemplo n.º 30
0
def diag_gpu(A, v1):
    # handle
    current_handle = cublas.cublasCreate()

    m = A.shape[0]
    Q = np.zeros((m, m), dtype=np.float64)
    # Q[0, :] = 0.0 # implied
    Q[1, :] = v1.copy()
    beta = np.zeros(m, dtype=np.float64)
    alpha = np.zeros(m, dtype=np.float64) 

    # move data onto the GPU
    A_gpu = gpuarray.to_gpu(A)
    Q_gpu = gpuarray.to_gpu(Q)
    beta_gpu = gpuarray.to_gpu(beta)
    alpha_gpu = gpuarray.to_gpu(alpha)
    w = gpuarray.zeros(m, dtype=np.float64)

    # we define three kernels for simple arithmetic
    w_scale = ElementwiseKernel(
        arguments="double *w, double *alpha, double *beta, double *Q1, double *Q2, int loop_index",
        operation="w[i] = w[i] - (alpha[loop_index] * Q1[i]) - (beta[loop_index] * Q2[i])",
        name="element_wise_w_building")
    # using -= to do inplace subtraction gives an incorrect answer


    norm_krnl = ReductionKernel(np.float64, neutral="0.0", reduce_expr="a+b", 
        map_expr="x[i]*x[i]", arguments="double *x")

    ediv = ElementwiseKernel(
        arguments="double *a, double *b, double *c, int loop_index",
        operation="a[i] = b[i] / c[loop_index+1]",
        name="element_wise_division")
    # the name must not have spaces!!!!

    for i in range(1, m-1):
        cublas.cublasDgemv(handle = current_handle, trans = 'T',
                            m = m, n = m, # Hermitian matrix
                            alpha = 1.0, 
                            beta = 0.0,
                            A = A_gpu.gpudata, 
                            lda = m,
                            x = Q_gpu[i, :].gpudata, 
                            incx = 1,    
                            y = w.gpudata, 
                            incy = 1,
                            )

        cublas.cublasDgemm(handle = current_handle, 
                            transa = 'n', transb = 'n',
                            m   = 1, n  = 1,      k = m,
                            lda = 1, ldb = m, ldc = 1,
                            alpha = 1.0,  beta = 0.0, 
                            A = w.gpudata, 
                            B = Q_gpu[i, :].gpudata, 
                            C = alpha_gpu[i].gpudata)


        w_scale(w, alpha_gpu, beta_gpu, Q_gpu[i, :], Q_gpu[i-1, :], i)
        beta_gpu[i+1] = cumath.sqrt(norm_krnl(w))
        ediv(Q_gpu[i+1, :], w, beta_gpu, i)
    # end of loop

    # last 2 steps
    cublas.cublasDgemv(handle = current_handle, trans = 'T',
                            m = m, n = m, # Hermitian matrix
                            alpha = 1.0,
                            beta = 0.0,
                            A = A_gpu.gpudata, 
                            lda = m,
                            x = Q_gpu[-1, :].gpudata,
                            incx = 1,    
                            y = w.gpudata,
                            incy = 1,)

    cublas.cublasDgemm(handle = current_handle, 
                        transa = 'n', transb = 'n',
                        m   = 1, n  = 1,  k = m,
                        lda = 1, ldb = m, ldc = 1,
                        alpha = 1.0,  beta = 0.0, 
                        A = w.gpudata, 
                        B = Q_gpu[-1, :].gpudata, 
                        C = alpha_gpu[-1].gpudata)

    # retrive the alpha's and betas
    alpha_cpu = alpha_gpu.get()
    beta_cpu = beta_gpu.get()

    print("GPU: ", alpha_cpu, beta_cpu, sep="\n\n")
    # make tridiagonal matrix out of alpha and B
    # Tri = np.zeros(matrix_size)
    return
Exemplo n.º 31
0
 def setUpClass(cls):
     cls.cublas_handle = cublas.cublasCreate()
Exemplo n.º 32
0
        flag = 1

    p_hat_gpu.gpudata.free()
    s_hat_gpu.gpudata.free()
    v_gpu.gpudata.free()
    t_gpu.gpudata.free()

    return xcg_gpu, 0, 0, 0


########
# MAIN #
########
culinalg.init()
cuda.init()
cublas_handle = cublas.cublasCreate()

start = cuda.Event()
end = cuda.Event()

# --- Wave propagation
c = np.float32(3e8)

# --- Frequency definitions
fmin = np.float32(1e9)
fmax = np.float32(2.e9)
freq = np.arange(fmin, fmax + .5e9, .5e9, dtype=np.float32)

lambdamin = np.float32(c / fmax)
lambdamax = np.float32(c / fmin)
Exemplo n.º 33
0
	# transfer kinetic portion to gpu
	T_gpu   = gpuarray.to_gpu(T_i)

	# allocate space on gpu for results
	U_x_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size
	U_y_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size
	U_z_gpu = gpuarray.zeros((basis_size, basis_size*basis_size), np.float64) # an empty matrix of the right size

#m, k, n = ud.basis_size, ud.basis_size, ud.basis_size**2
for basis in range(BASIS_SIZE):
	prepare_gpu(basis) # set it up
	i, j_k, i_prime = basis_size, basis_size*basis_size, basis_size
	initial_time = time.clock()
	for num_iter in range(ITERATIONS):
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= i, n 	= j_k, 		k = i_prime,
							lda = i, ldb = i_prime, ldc = i,
							alpha = ud.alpha,  beta = ud.beta, 
							A = T_gpu.gpudata, 
							B = v_x_gpu.gpudata, 
							C = U_x_gpu.gpudata, )
		cublas.cublasDgemm(handle = cublas.cublasCreate(), 
							transa = 'n', transb = 'n',
							m 	= i, n 	= j_k, 		k = i_prime,
							lda = i, ldb = i_prime, ldc = i,
							alpha = ud.alpha,  beta = ud.beta, 
							A = T_gpu.gpudata, 
							B = v_y_gpu.gpudata, 
							C = U_y_gpu.gpudata, )
Exemplo n.º 34
0
x = np.ones(N, dtype=np.float32)
y = 2. * np.ones(N, dtype=np.float32)

nStreams = 2
streams = [cuda.Stream() for i in range(nStreams)]

x_pin = [
    cuda.register_host_memory(x[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]
y_pin = [
    cuda.register_host_memory(y[i * N / nStreams:(i + 1) * N / nStreams])
    for i in range(nStreams)
]

h = cublas.cublasCreate()

x_gpu = np.empty(nStreams, dtype=object)
y_gpu = np.empty(nStreams, dtype=object)
ans = np.empty(nStreams, dtype=object)

for i in range(nStreams):
    cublas.cublasSetStream(h, streams[i].handle)

    x_gpu[i] = gpuarray.to_gpu_async(x_pin[i], stream=streams[i])
    y_gpu[i] = gpuarray.to_gpu_async(y_pin[i], stream=streams[i])

    cublas.cublasSaxpy(h, x_gpu[i].size, a, x_gpu[i].gpudata, 1,
                       y_gpu[i].gpudata, 1)
    ans[i] = y_gpu[i].get_async(stream=streams[i])
Exemplo n.º 35
0
def attach_cublas_handle_to_context(ctx):
    handle = getattr(ctx, 'cublas_handle', None)
    if handle is None:
        with ctx:
            ctx.cublas_handle = cublas.cublasCreate()
Exemplo n.º 36
0
# coding=utf-8
import pycuda.autoinit
from pycuda import gpuarray
import numpy as np
from skcuda import cublas
from time import time

a = np.float32(10)
x = np.float32([1, 2, 3])
y = np.float32([-.345, 8.15, -15.867])
x_gpu = gpuarray.to_gpu(x)
y_gpu = gpuarray.to_gpu(y)

# create a cuBLAS context. This is similar in nature to CUDA contexts
cublas_context_h = cublas.cublasCreate()
"""Level-1 AXPY (vector-vector)"""

# this is a direct wrapper to a low-level C function, so the input may seem more like a C function than a true Python function.
# In short, this performed an "AXPY" operation, ultimately putting the output data into the y_gpu array
# first input is always the CUDA context handle. We then have to specify the size of the vectors, since this function will be ultimately
# operating on C pointers; we can do this by using the size parameter of a gpuarray. Having typecasted our scalar already to a NumPy float32 variable,
# we can pass the a variable right over as the scalar parameter. We then hand the underlying C pointer of the x_gpu array to this function using the gpudata
# parameter. Then we specify the stride of the first array as 1: the stride specifies how many steps we should take between each input value.
# (In contrast, if you were using a vector from a column in a row-wise matrix, you would set the stride to the width of the matrix.)
# We then put in the pointer to the y_gpu array, and set its stride to 1 as well

#We can now use the cublasSaxpy function. The S stands for single precision, which is what we will need since we are working with 32-bit floating point arrays:
cublas.cublasSaxpy(cublas_context_h, x_gpu.size, a, x_gpu.gpudata, 1,
                   y_gpu.gpudata, 1)

print(y_gpu.get())
Exemplo n.º 37
0
def cublas_handle():
    cublas_handle = cublas.cublasCreate()
    cublas.cublasSetStream(cublas_handle, stream().handle)
    return cublas_handle
Exemplo n.º 38
0
def vis_gpu(antpos,
            freq,
            eq2tops,
            crd_eq,
            I_sky,
            bm_cube,
            nthreads=NTHREADS,
            max_memory=MAX_MEMORY,
            real_dtype=np.float32,
            complex_dtype=np.complex64,
            verbose=False):
    # ensure shapes
    nant = antpos.shape[0]
    assert (antpos.shape == (nant, 3))
    npix = crd_eq.shape[1]
    assert (crd_eq.shape == (3, npix))
    assert (I_sky.shape == (npix, ))
    beam_px = bm_cube.shape[1]
    assert (bm_cube.shape == (nant, beam_px, beam_px))
    ntimes = eq2tops.shape[0]
    assert (eq2tops.shape == (ntimes, 3, 3))
    # ensure data types
    antpos = antpos.astype(real_dtype)
    eq2tops = eq2tops.astype(real_dtype)
    crd_eq = crd_eq.astype(real_dtype)
    Isqrt = np.sqrt(I_sky).astype(real_dtype)
    bm_cube = bm_cube.astype(real_dtype)  # XXX complex?
    chunk = max(min(npix, MIN_CHUNK),
                2**int(ceil(np.log2(float(nant * npix) / max_memory / 2))))
    npixc = npix / chunk
    # blocks of threads are mapped to (pixels,ants,freqs)
    block = (max(1, nthreads / nant), min(nthreads, nant), 1)
    grid = (int(ceil(npixc / float(block[0]))),
            int(ceil(nant / float(block[1]))))
    gpu_code = GPU_TEMPLATE % {
        'NANT': nant,
        'NPIX': npixc,
        'BEAM_PX': beam_px,
        'BLOCK_PX': block[0],
    }
    gpu_module = compiler.SourceModule(gpu_code)
    bm_interp = gpu_module.get_function("InterpolateBeam")
    meas_eq = gpu_module.get_function("MeasEq")
    bm_texref = gpu_module.get_texref("bm_tex")
    import pycuda.autoinit
    h = cublasCreate()  # handle for managing cublas
    # define GPU buffers and transfer initial values
    bm_texref.set_array(
        numpy3d_to_array(bm_cube)
    )  # never changes, transpose happens in copy so cuda bm_tex is (BEAM_PX,BEAM_PX,NANT)
    antpos_gpu = gpuarray.to_gpu(
        antpos)  # never changes, set to -2*pi*antpos/c
    Isqrt_gpu = gpuarray.empty(shape=(npixc, ), dtype=real_dtype)
    A_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=real_dtype)  # will be set on GPU by bm_interp
    crd_eq_gpu = gpuarray.empty(shape=(3, npixc), dtype=real_dtype)
    eq2top_gpu = gpuarray.empty(shape=(3, 3),
                                dtype=real_dtype)  # sent from CPU each time
    crdtop_gpu = gpuarray.empty(shape=(3, npixc),
                                dtype=real_dtype)  # will be set on GPU
    tau_gpu = gpuarray.empty(shape=(nant, npixc),
                             dtype=real_dtype)  # will be set on GPU
    v_gpu = gpuarray.empty(shape=(nant, npixc),
                           dtype=complex_dtype)  # will be set on GPU
    vis_gpus = [
        gpuarray.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in xrange(chunk)
    ]
    # output CPU buffers for downloading answers
    vis_cpus = [
        np.empty(shape=(nant, nant), dtype=complex_dtype)
        for i in xrange(chunk)
    ]
    streams = [driver.Stream() for i in xrange(chunk)]
    event_order = ('start', 'upload', 'eq2top', 'tau', 'interpolate',
                   'meas_eq', 'vis', 'end')
    vis = np.empty((ntimes, nant, nant), dtype=complex_dtype)
    for t in xrange(ntimes):
        if verbose: print '%d/%d' % (t + 1, ntimes)
        eq2top_gpu.set(
            eq2tops[t])  # defines sky orientation for this time step
        events = [{e: driver.Event()
                   for e in event_order} for i in xrange(chunk)]
        for c in xrange(chunk + 2):
            cc = c - 1
            ccc = c - 2
            if 0 <= ccc < chunk:
                stream = streams[ccc]
                vis_gpus[ccc].get_async(ary=vis_cpus[ccc], stream=stream)
                events[ccc]['end'].record(stream)
            if 0 <= cc < chunk:
                stream = streams[cc]
                cublasSetStream(h, stream.handle)
                ## compute crdtop = dot(eq2top,crd_eq)
                # cublas arrays are in Fortran order, so P=M*N is actually
                # peformed as P.T = N.T * M.T
                cublasSgemm(h, 'n', 'n', npixc, 3, 3, 1., crd_eq_gpu.gpudata,
                            npixc, eq2top_gpu.gpudata, 3, 0.,
                            crdtop_gpu.gpudata, npixc)
                events[cc]['eq2top'].record(stream)
                ## compute tau = dot(antpos,crdtop)
                cublasSgemm(h, 'n', 'n', npixc, nant, 3, 1.,
                            crdtop_gpu.gpudata, npixc, antpos_gpu.gpudata, 3,
                            0., tau_gpu.gpudata, npixc)
                events[cc]['tau'].record(stream)
                ## interpolate bm_tex at specified topocentric coords, store interpolation in A
                ## threads are parallelized across pixel axis
                bm_interp(crdtop_gpu,
                          A_gpu,
                          grid=grid,
                          block=block,
                          stream=stream)
                events[cc]['interpolate'].record(stream)
                # compute v = A * I * exp(1j*tau*freq)
                meas_eq(A_gpu,
                        Isqrt_gpu,
                        tau_gpu,
                        real_dtype(freq),
                        v_gpu,
                        grid=grid,
                        block=block,
                        stream=stream)
                events[cc]['meas_eq'].record(stream)
                # compute vis = dot(v, v.T)
                # transpose below incurs about 20% overhead
                cublasCgemm(h, 'c', 'n', nant, nant, npixc, 1., v_gpu.gpudata,
                            npixc, v_gpu.gpudata, npixc, 0.,
                            vis_gpus[cc].gpudata, nant)
                events[cc]['vis'].record(stream)
            if c < chunk:
                stream = streams[c]
                events[c]['start'].record(stream)
                crd_eq_gpu.set_async(crd_eq[:, c * npixc:(c + 1) * npixc],
                                     stream=stream)
                Isqrt_gpu.set_async(Isqrt[c * npixc:(c + 1) * npixc],
                                    stream=stream)
                events[c]['upload'].record(stream)
        events[chunk - 1]['end'].synchronize()
        vis[t] = sum(vis_cpus)
        if verbose:
            for c in xrange(chunk):
                print '%d:%d START->END:' % (
                    c, chunk), events[c]['start'].time_till(
                        events[c]['end']) * 1e-3
                #for i,e in enumerate(event_order[:-1]):
                #    print c, e,'->',event_order[i+1], ':', events[c][e].time_till(events[c][event_order[i+1]]) * 1e-3
            print 'TOTAL:', events[0]['start'].time_till(
                events[chunk - 1]['end']) * 1e-3
    # teardown GPU configuration
    cublasDestroy(h)
    return vis
Exemplo n.º 39
0
 def setUp(self):
     np.random.seed(23)    # For reproducible tests.
     self.cublas_handle = cublas.cublasCreate()
Exemplo n.º 40
0
c_cpu = np.dot(a_cpu, b_cpu)

# printing the process (CPU):
print("\nComputing matrix vector mutliplication in CPU:\n")
print("a = \n", a_cpu, "\n")
print("b = \n", b_cpu, "\n")
print("Product =\n", c_cpu, "\n")

# allocating and converting (a) to a gpuarray:
a_gpu = gpuarray.to_gpu(a_cpu)
# allocating and converting (b) to a gpuarray:
b_gpu = gpuarray.to_gpu(b_cpu)
# allocating c as a 2x1 matrix filled with zeros:
c_gpu = gpuarray.zeros((2,1), dtype = np.float32)
# Computing matrix product of gpu (a) and (b) and storing it in gpu (c):
cublas.cublasSgemv(handle = cublas.cublasCreate(),
				trans = 'n',
				m = 2, n = 3,
				alpha = 1.0,
				A = a_gpu.gpudata, lda = 2,
				x = b_gpu.gpudata, incx = 1, 
				beta = 1.0, y = c_gpu.gpudata, incy = 1)

# printing the process(GPU):
print("\nComputing matrix vector mutliplication in GPU:\n")
print("a = \n", a_gpu.get(), "\n")
print("b = \n", b_gpu.get(), "\n")
print("Product =\n", c_gpu.get(), "\n")