def calculate_z_numpy_gpu(q, maxiter, z):
    """Calculate z using numpy on the GPU via gpuarray.

    Iterates ``z = z*z + q`` element-wise on the device for ``maxiter``
    steps and records, per element, the iteration index at which ``abs(z)``
    exceeded 2.0.

    Args:
        q: host array of complex constants; cast to complex64 before upload.
        maxiter: number of iterations to run.
        z: host array of complex start values; cast to complex64 before
            upload. The helper buffers are flat vectors of length
            ``z.size``, so ``q`` and ``z`` are expected to be 1-D and of
            equal length.

    Returns:
        int32 numpy array shaped like ``q`` holding the recorded iteration
        index. Elements that never exceeded 2.0 keep the initial value 0.
    """
    outputg = gpuarray.to_gpu(nm.zeros(q.shape, dtype=nm.int32))
    zg = gpuarray.to_gpu(z.astype(nm.complex64))
    qg = gpuarray.to_gpu(q.astype(nm.complex64))
    # 2.0 as an array
    twosg = gpuarray.to_gpu(nm.full(zg.size, 2.0, dtype=nm.float32))
    # 0+0j as an array
    cmplx0sg = gpuarray.to_gpu(nm.zeros(zg.size, dtype=nm.complex64))
    # we'll add 1 to iterg after each iteration
    iterg = gpuarray.to_gpu(nm.zeros(zg.size, dtype=nm.int32))

    # NOTE: no comparison buffer is pre-allocated; the comparison below
    # produces a fresh array on every pass.
    for _ in range(maxiter):
        zg = zg * zg + qg
        # abs returns a complex (rather than a float) from the complex
        # input where the real component is the absolute value (which
        # looks like a bug) so I take the .real after abs()
        abs_zg = abs(zg).real
        comparison_result = abs_zg > twosg
        # Zero out q and z where the threshold was crossed so those
        # elements stay at 0 and are not flagged again.
        qg = gpuarray.if_positive(comparison_result, cmplx0sg, qg)
        zg = gpuarray.if_positive(comparison_result, cmplx0sg, zg)
        outputg = gpuarray.if_positive(comparison_result, iterg, outputg)
        iterg = iterg + 1
    return outputg.get()
def calculate_z_numpy_gpu(q, maxiter, z):
    """Calculate z using numpy on the GPU.

    Iterates ``z = z*z + q`` element-wise on the device for ``maxiter``
    steps and records, per element, the iteration index at which ``abs(z)``
    exceeded 2.0.

    Args:
        q: host array of complex constants; cast to complex64 before upload.
        maxiter: number of iterations to run.
        z: host array of complex start values; cast to complex64 before
            upload. The helper buffers are flat vectors of length
            ``z.size``, so ``q`` and ``z`` are expected to be 1-D and of
            equal length.

    Returns:
        int32 numpy array shaped like ``q`` holding the recorded iteration
        index. Elements that never exceeded 2.0 keep the initial value 0.
    """
    # The output buffer must share iterg's dtype (int32): if_positive picks
    # between the two and needs matching then_/else_ dtypes.
    outputg = gpuarray.to_gpu(nm.zeros(q.shape, dtype=nm.int32))
    zg = gpuarray.to_gpu(z.astype(nm.complex64))
    qg = gpuarray.to_gpu(q.astype(nm.complex64))
    # 2.0 as an array
    twosg = gpuarray.to_gpu(nm.full(zg.size, 2.0, dtype=nm.float32))
    # 0+0j as an array
    cmplx0sg = gpuarray.to_gpu(nm.zeros(zg.size, dtype=nm.complex64))
    # we'll add 1 to iterg after each iteration
    iterg = gpuarray.to_gpu(nm.zeros(zg.size, dtype=nm.int32))

    for _ in range(maxiter):
        zg = zg * zg + qg
        # abs returns a complex (rather than a float) from the complex
        # input where the real component is the absolute value (which
        # looks like a bug) so I take the .real after abs()
        abs_zg = abs(zg).real
        comparison_result = abs_zg > twosg
        # Zero out q and z where the threshold was crossed so those
        # elements stay at 0 and are not flagged again.
        qg = gpuarray.if_positive(comparison_result, cmplx0sg, qg)
        zg = gpuarray.if_positive(comparison_result, cmplx0sg, zg)
        outputg = gpuarray.if_positive(comparison_result, iterg, outputg)
        iterg = iterg + 1
    return outputg.get()
def calculate_z_asnumpy_gpu(q, maxiter, z):
    """Calculate z using numpy on the GPU.

    Iterates ``z = z*z + q`` element-wise on the device for ``maxiter``
    steps and records, per element, the iteration index at which ``abs(z)``
    exceeded 2.0.

    Args:
        q: host array of complex constants.
        maxiter: number of iterations to run.
        z: host array of complex start values. The helper buffers are flat
            vectors of length ``z.size``, so ``q`` and ``z`` are expected
            to be 1-D and of equal length.

    Returns:
        int32 numpy array shaped like ``q`` holding the recorded iteration
        index. Elements that never exceeded 2.0 keep the initial value 0.
    """
    # convert complex128s (2*float64) to complex64 (2*float32) so they run
    # on older CUDA cards like the one in my MacBook. To use float64 doubles
    # just edit these two lines
    complex_type = np.complex64  # or np.complex128 on newer CUDA devices
    float_type = np.float32  # or np.float64 on newer CUDA devices

    # create an output array on the gpu of int32 as one long vector; the
    # explicit dtype matters because if_positive below selects between this
    # buffer and the int32 iteration counter
    outputg = gpuarray.to_gpu(np.zeros(q.shape, dtype=np.int32))
    # resize our z and q as necessary to longer or shorter float types
    z = z.astype(complex_type)
    q = q.astype(complex_type)
    # create zg and qg on the gpu
    zg = gpuarray.to_gpu(z)
    qg = gpuarray.to_gpu(q)
    # create 2.0 as an array
    twosg = gpuarray.to_gpu(np.full(zg.size, 2.0, dtype=float_type))
    # create 0+0j as an array
    cmplx0sg = gpuarray.to_gpu(np.zeros(zg.size, dtype=complex_type))
    # we'll add 1 to iterg after each iteration, create an array to hold
    # the iteration count
    iterg = gpuarray.to_gpu(np.zeros(zg.size, dtype=np.int32))

    for _ in range(maxiter):
        # multiply z on the gpu by itself, add q (on the gpu)
        zg = zg * zg + qg
        # abs returns a complex (rather than a float) from the complex
        # input where the real component is the absolute value (which
        # looks like a bug) so I take the .real after abs()
        # the above bug relates to pyCUDA from mid2010, it might be fixed now...
        abs_zg = abs(zg).real
        # figure out if zg is > 2
        comparison_result = abs_zg > twosg
        # based on the result either take 0+0j for qg and zg or leave unchanged
        qg = gpuarray.if_positive(comparison_result, cmplx0sg, qg)
        zg = gpuarray.if_positive(comparison_result, cmplx0sg, zg)
        # if the comparison is true then update the iterations count to outputg
        # which we'll extract later
        outputg = gpuarray.if_positive(comparison_result, iterg, outputg)
        # increment the iteration counter
        iterg = iterg + 1
    # extract the result from the gpu back to the cpu
    return outputg.get()
def sqrt_normalize_gpu(img):
    """Apply a signed square root to an image on the GPU, then normalize it.

    The first three channels of ``img`` and its fourth channel are uploaded
    separately. Each value ``x`` becomes ``sqrt(abs(x))`` multiplied by +1
    where ``x > 0`` and by -1 otherwise, and the two results are handed to
    ``normalize_gpu``.

    The +1/-1 constant arrays are cached in the module globals ``posr``,
    ``negr``, ``posa`` and ``nega``. They are (re)built whenever the cache
    is not a GPU array yet or no longer matches the current image's shape
    or dtype.

    Args:
        img: host array indexable as ``img[:, :, c]`` with at least four
            channels along the last axis.

    Returns:
        Whatever ``normalize_gpu(rgb, a)`` returns.
    """
    global posr, negr, posa, nega, stream
    rgb = gpuarray.to_gpu(img[:, :, :3].copy())
    a = gpuarray.to_gpu(img[:, :, 3].copy())

    # Explicit type/shape test instead of truth-testing the cached array:
    # the truth value of a multi-element array is not a reliable
    # "is the cache initialised" signal.
    cache_stale = (
        not isinstance(posr, gpuarray.GPUArray)
        or posr.shape != rgb.shape
        or posr.dtype != rgb.dtype
    )
    if cache_stale:
        posr = gpuarray.zeros_like(rgb) + 1
        negr = gpuarray.zeros_like(rgb) - 1
        posa = gpuarray.zeros_like(a) + 1
        nega = gpuarray.zeros_like(a) - 1

    rgb = cumath.sqrt(abs(rgb), stream=stream) * gpuarray.if_positive(
        rgb, posr, negr, stream=stream)
    a = cumath.sqrt(abs(a), stream=stream) * gpuarray.if_positive(
        a, posa, nega, stream=stream)
    return normalize_gpu(rgb, a)
def linear_corr_cuda(self, image, l):
    """Return ``1 - d / l`` clipped below at zero, computed on the GPU.

    ``d`` is the result of ``self.pairwise_difference(image, N)`` with
    ``N = image.shape[0]``. Entries of ``1 - d / l`` that are not positive
    are replaced by zero, and a copy of the clipped array is returned.

    Args:
        image: array whose leading dimension gives ``N``.
        l: divisor applied to the pairwise differences.
    """
    n_items = image.shape[0]
    differences = self.pairwise_difference(image, n_items)
    corr = (1 - (differences / l))
    # if_positive keeps corr where it is > 0 and takes the zero array elsewhere.
    zero_fill = misc.zeros(corr.shape, corr.dtype)
    clipped = gpuarray.if_positive(corr, corr, zero_fill)
    return clipped.copy()
def step_1(matrix_color, matrix_suma):
    """Return ``3 * matrix_color / matrix_suma`` where ``matrix_suma`` is positive.

    Elements whose ``matrix_suma`` value is not positive take the
    ``matrix_suma`` value itself.
    """
    # gpuarray.if_positive evaluates every position of the matrix and,
    # depending on its value, selects the first or the second operand,
    # i.e. it acts as an element-wise if/else.
    scaled_ratio = (3 * matrix_color) / matrix_suma
    return gpuarray.if_positive(matrix_suma, scaled_ratio, matrix_suma)
def calculate_z_asnumpy_gpu(q, maxiter, z):
    """Calculate z using numpy on the GPU.

    Iterates ``z = z*z + q`` element-wise on the device for ``maxiter``
    steps and records, per element, the iteration index at which ``abs(z)``
    exceeded 2.0.

    Args:
        q: host array of complex constants.
        maxiter: number of iterations to run.
        z: host array of complex start values. The helper buffers are flat
            vectors of length ``z.size``, so ``q`` and ``z`` are expected
            to be 1-D and of equal length.

    Returns:
        int32 numpy array shaped like ``q`` holding the recorded iteration
        index. Elements that never exceeded 2.0 keep the initial value 0.
    """
    # convert complex128s (2*float64) to complex64 (2*float32) so they run
    # on older CUDA cards like the one in my MacBook. To use float64 doubles
    # just edit these two lines
    complex_type = np.complex64  # or np.complex128 on newer CUDA devices
    float_type = np.float32  # or np.float64 on newer CUDA devices

    # create an output array on the gpu of int32 as one long vector; the
    # explicit dtype matters because if_positive below selects between this
    # buffer and the int32 iteration counter
    outputg = gpuarray.to_gpu(np.zeros(q.shape, dtype=np.int32))
    # resize our z and q as necessary to longer or shorter float types
    z = z.astype(complex_type)
    q = q.astype(complex_type)
    # create zg and qg on the gpu
    zg = gpuarray.to_gpu(z)
    qg = gpuarray.to_gpu(q)
    # create 2.0 as an array
    twosg = gpuarray.to_gpu(np.full(zg.size, 2.0, dtype=float_type))
    # create 0+0j as an array
    cmplx0sg = gpuarray.to_gpu(np.zeros(zg.size, dtype=complex_type))
    # we'll add 1 to iterg after each iteration, create an array to hold
    # the iteration count
    iterg = gpuarray.to_gpu(np.zeros(zg.size, dtype=np.int32))

    for _ in range(maxiter):
        # multiply z on the gpu by itself, add q (on the gpu)
        zg = zg * zg + qg
        # abs returns a complex (rather than a float) from the complex
        # input where the real component is the absolute value (which
        # looks like a bug) so I take the .real after abs()
        # the above bug relates to pyCUDA from mid2010, it might be fixed now...
        abs_zg = abs(zg).real
        # figure out if zg is > 2
        comparison_result = abs_zg > twosg
        # based on the result either take 0+0j for qg and zg or leave unchanged
        qg = gpuarray.if_positive(comparison_result, cmplx0sg, qg)
        zg = gpuarray.if_positive(comparison_result, cmplx0sg, zg)
        # if the comparison is true then update the iterations count to outputg
        # which we'll extract later
        outputg = gpuarray.if_positive(comparison_result, iterg, outputg)
        # increment the iteration counter
        iterg = iterg + 1
    # extract the result from the gpu back to the cpu
    return outputg.get()
def make_sample_data(set_: int):
    """Generate one synthetic data set, its neighbour lists and LSH hashes, and save it.

    The data set is selected by ``set_`` (1: uniform, 2: three Gaussian
    clusters, 3: ten Gaussian clusters). Neighbour index lists for
    k = 10, 15, ..., 50 are stored per row, every vector is hashed with a
    random-hyperplane LSH evaluated on the GPU, and the resulting frame is
    written to ``{config.CUDA_neighbour_search_df_dir}df-{set_}.parquet``.

    Args:
        set_: which data set to build; must be 1, 2 or 3.

    Raises:
        ValueError: if ``set_`` is not one of the supported values.
    """
    np.random.seed(set_ * 4347)
    if set_ == 1:
        # Uniform distribution
        data = np.random.uniform(0, 1, size=(samples, num_features))
    elif set_ == 2:
        # 3 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=3)
    elif set_ == 3:
        # 10 Gaussian distribution
        data = multi_gauss_clusters(n_clusters=10)
    else:
        # Fail with a clear message instead of hitting an unbound `data` below.
        raise ValueError(f"unknown sample set {set_!r}; expected 1, 2 or 3")

    df = pd.DataFrame()
    np.random.shuffle(data)
    df['vec'] = data.tolist()

    # find nearest neighbours
    from sklearn.neighbors import NearestNeighbors
    nbrs = NearestNeighbors(n_neighbors=51, algorithm='ball_tree',
                            leaf_size=30).fit(data)
    _, nbrs_indices = nbrs.kneighbors(data)
    for n_nbr in range(10, 51, 5):
        # Column 0 is skipped -- presumably it is the query point itself,
        # since the index was fitted on the same data (TODO confirm).
        df[f"known_neighbours_{n_nbr}"] = [
            x[1:(n_nbr + 1)] for x in nbrs_indices
        ]

    # hash using random hyperplane LSH
    import pycuda.gpuarray as gpuarray
    import skcuda.linalg as linalg
    import pycuda.autoinit
    linalg.init()
    # NOTE(review): CUDA_HOME is set only after the CUDA libraries have been
    # imported and initialised -- confirm this ordering is intentional.
    os.environ['CUDA_HOME'] = "/opt/cuda/"

    vec_np = np.array(df['vec'].values.tolist(), dtype=np.float32)
    LSH = LSHBias(feature_dim=num_features, bits=LSH_NUM_BITS)
    W = np.array(LSH.W, dtype=np.float32)
    b_gpu = gpuarray.to_gpu(W)

    # append a column of ones so the last row of W acts as the bias term
    ones = np.ones(shape=(vec_np.shape[0], 1), dtype=np.float32)
    X = np.concatenate((vec_np, ones), axis=1)

    # do the matrix multiplication
    a_gpu = gpuarray.to_gpu(X)
    mul = linalg.mdot(a_gpu, b_gpu)

    # get binary: 1 if value >= 0, else 0
    res = gpuarray.if_positive(
        mul >= gpuarray.zeros(mul.shape, dtype=np.float32),
        then_=gpuarray.ones_like(mul),
        else_=gpuarray.zeros_like(mul))
    res = np.array(res.get(), dtype=np.uint32)

    # convert grouped bits to integers
    res = np_array_binary_to_grouped_integers(res)
    df[f"hash_{LSH_NUM_BITS}_bits"] = list(res)

    df.to_parquet(f"{config.CUDA_neighbour_search_df_dir}df-{set_}.parquet",
                  index=False)
    print("created test-data")
def wsparsify(w_gpu, percentage):
    """
    Keeps only as many entries nonzero as specified by percentage.

    The threshold is the value found ``percentage`` percent of the way into
    the descending-sorted, flattened weights. Entries strictly greater than
    that threshold are kept; all others are set to zero.

    Args:
        w_gpu: GPU array of weights (any shape).
        percentage: share of entries to keep, in percent. Values of 100 or
            more keep everything; values of 0 or less keep nothing.

    Returns:
        A new GPU array with the same shape as ``w_gpu``.
    """
    w = w_gpu.get()
    n_total = w.size
    # int() truncation gives a valid integer index (floor() returns a float).
    idx = int(n_total * percentage / 100)
    if idx >= n_total:
        # Nothing to cut off (also covers an empty array); there is no
        # threshold element at position n_total to read.
        return w_gpu.copy()
    idx = max(idx, 0)

    # Sort the flattened values so the threshold is global, not per row.
    vals = sort(w.ravel())[::-1]

    zw_gpu = cua.zeros_like(w_gpu)  # gpu array filled with zeros
    tw_gpu = cua.empty_like(w_gpu)  # gpu array containing threshold
    tw_gpu.fill(vals[idx])
    return cua.if_positive(w_gpu > tw_gpu, w_gpu, zw_gpu)
def run_gpu(self, Niters):
    """
    Run G-S on GPU for ``Niters`` iterations.

    The result is overwritten on the attribute "self.wdata" containing a
    pycuda array; nothing is returned. ``self.phase_data`` is passed to
    ``self.cuphase`` on every iteration.
    """
    Nz, Ny, Nx = self.shape

    # Device work buffers: one exit-plane wave and two focal-series stacks
    wave_buf = gpuarray.empty((Ny, Nx), np.complex64)
    sim = gpuarray.empty((Nz, Ny, Nx), np.complex64)
    Isim = gpuarray.empty((Nz, Ny, Nx), np.complex64)

    for _ in trange(Niters):
        # Propagate the initial wave to simulate defocused waves
        # Psi(x,y,z) = convolve[Psi(x,y,0), CTF(x,y,z)]
        cu_fft.fft(self.wdata, wave_buf, self.pft2dcc)
        for plane in range(Nz):
            sim[plane, :, :] = self.ctfd[plane, :, :] * wave_buf
        cu_fft.ifft(sim, sim, self.pft3dcc, True)

        if not hasattr(self, 'Esdata'):
            # Combine experimental amplitudes with simulated phase
            # Psi' = [sqrt(Iexp)]*exp[i*arg(Psi)]
            self.cuwave(self.Iexp, sim, Isim)
        else:
            # Use the intensities, Isim = |Psi|**2
            # Convolve with spatial-coherence envelope
            # Isim = convolve[Isim, Es]
            Isim = sim * sim.conj()
            cu_fft.fft(Isim, Isim, self.pft3dcc)
            cu_fft.ifft(Isim * self.Esdata, Isim, self.pft3dcc, True)
            # Combine experimental and simulated amplitudes with simulated phase
            # Psi' = [abs(Psi)+sqrt(Iexp)-sqrt(Isim)]*exp[i*arg(Psi)]
            self.cuwave(self.Iexp, sim, Isim.real, Isim)

        # Take the updated wave where the mask is positive, keep the
        # simulated wave elsewhere
        sim = gpuarray.if_positive(self.mask, Isim, sim)

        # then back-propagate to the exit plane and take average
        # Psi(x,y,0) = < convolve[Psi, CTF*] >_z
        cu_fft.fft(sim, sim, self.pft3dcc)
        sim = sim * self.ctfd.conj()
        cu_fft.ifft(sim, sim, self.pft3dcc, True)
        stacked = sim.reshape(Nz, Nx * Ny)
        wave_buf = misc.mean(stacked, 0).reshape(Ny, Nx)

        # update phase and wave
        self.cuphase(wave_buf, self.wdata, self.phase_data)
        self.wdata = wave_buf.copy()
def isgreater_gpu(x_gpu, y_gpu):
    """
    Computes if x_gpu > y_gpu and gives back a mask with 0s and 1s.
    Note, that y_gpu can be a scalar value as well.

    Args:
        x_gpu: GPU array to test.
        y_gpu: GPU array to compare against, or a Python/NumPy real scalar.
            A scalar is converted to float32 and expanded to an array
            shaped like ``x_gpu``.

    Returns:
        GPU array shaped like ``x_gpu`` holding 1 where ``x_gpu > y_gpu``
        and 0 elsewhere.
    """
    # isinstance covers Python and NumPy scalars alike and does not rely on
    # the `np.float` alias, which no longer exists in current NumPy.
    if isinstance(y_gpu, (int, float, np.integer, np.floating)):
        threshold = np.float32(y_gpu)
        y_gpu = cua.empty_like(x_gpu)  # gpu array containing threshold
        y_gpu.fill(threshold)

    zeros_gpu = cua.zeros_like(x_gpu)  # gpu array filled with zeros
    ones_gpu = cua.empty_like(x_gpu)  # gpu array filled with ones
    ones_gpu.fill(np.float32(1.))
    return cua.if_positive(x_gpu > y_gpu, ones_gpu, zeros_gpu)
# Read image. BW images have R=G=B so extract the R-value
image = img.imread(in_file_name)[:, :, 0]
height, width = np.int32(image.shape)
area = height * width
# Parenthesised single-argument form works as a statement on Python 2 and
# as a function call on Python 3.
print("Processing %d x %d image" % (width, height))

blocksize = (32, 32, 1)
# NOTE(review): integer truncation means a trailing strip of pixels gets no
# thread block when width or height is not a multiple of 32 -- confirm the
# inputs are always 32-aligned.
gridsize = (int(width // 32), int(height // 32))

ones = np.ones([height, width])

# Device buffers: the image as float32 plus int32 one/zero planes that
# if_positive selects between.
im_d = gpu.to_gpu(np.float32(np.array(image)))
ones_d = gpu.to_gpu(np.int32(np.array(ones)))
zero_d = gpu.to_gpu(np.int32(np.zeros([height, width])))

# Seed flags: 0 where the pixel exceeds seed_threshold, 1 elsewhere.
threshold_d = gpu.if_positive(im_d - seed_threshold, zero_d, ones_d)
new_d = gpu.to_gpu(np.int32(np.zeros([height, width])))

# Re-run the filter until the number of flagged pixels stops changing.
old_flags = 0
new_flags = gpu.sum(threshold_d).get()
while (new_flags - old_flags) != 0:
    old_flags = new_flags
    # run filter kernel
    filter_kernel(threshold_d, im_d, new_d, width, height, threshold,
                  block=blocksize, grid=gridsize)
    new_flags = gpu.sum(new_d).get()
    # transfer output to input
    # NOTE(review): this rebinds the name only, so from the second pass on
    # the kernel reads and writes the same buffer -- confirm that is intended.
    threshold_d = new_d
# hashing different .orc DataFrames for filename in tqdm(glob(basepath + "part-*.orc")): df = pd.read_orc(filename) df = df.rename(columns={"FeatureVector_all_features": "vec"}) count += 1 vec_np = np.array(df['vec'].values.tolist(), dtype=np.float32) # add bias term ones = np.ones(shape=(vec_np.shape[0], 1), dtype=np.float32) X = np.concatenate((vec_np, ones), axis=1) # do the matrix multiplication a_gpu = gpuarray.to_gpu(X) mul = linalg.mdot(a_gpu, b_gpu) # get binary: 1 if value >= 0, else 0 res = gpuarray.if_positive( mul >= gpuarray.zeros(mul.shape, dtype=np.float32), then_=gpuarray.ones_like(mul), else_=gpuarray.zeros_like(mul)) res = np.array(res.get(), dtype=np.uint32) # convert grouped bits to integers res = np_array_binary_to_grouped_integers(res) df[f"hash_{LSH_NUM_BITS}_bits"] = [x for x in res] df = df[["rec_MBID", f"hash_{LSH_NUM_BITS}_bits"]] df.to_parquet(f"{config.ABz_GPU_hashed_output_dir}{count}.parquet", index=False) # save as a single parquet file spark = SparkSession \ .builder \ .appName("hashed file coalesce") \
H = 240
cap.set(cv.CAP_PROP_FRAME_HEIGHT, H)
cap.set(cv.CAP_PROP_FRAME_WIDTH, W)

try:
    ret, frame = cap.read()
    if not ret:
        raise RuntimeError("could not read an initial frame from the capture device")
    # Use the same BGR->gray conversion as the loop below so the first
    # frame difference compares like with like.
    gray_a = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)

    img_ori_gpu = gpuarray.to_gpu(gray_a.astype(np.float32))
    img_buf_gpu = gpuarray.empty_like(img_ori_gpu)
    # Constant planes: the change threshold (25) and the value written
    # where the change exceeds it (0).
    img_sub = gpuarray.ones_like(img_ori_gpu)
    img_sub = 25 * img_sub
    img_bgm = gpuarray.zeros_like(img_sub)

    while True:
        ret, frame = cap.read()
        if not ret:
            # Stream ended or the device stopped delivering frames.
            break
        gray_buff = cv.cvtColor(frame, cv.COLOR_BGR2GRAY)
        img_res_gpu = gpuarray.to_gpu(gray_buff.astype(np.float32))

        # |previous - current| - 25: positive where the pixel changed by
        # more than the threshold.
        img_buf_gpu = cmath.fabs(img_ori_gpu - img_res_gpu)
        img_buf_gpu = img_buf_gpu - img_sub
        img_ori_gpu = img_res_gpu.copy()
        # Changed pixels take the value from img_bgm (0); the rest keep the
        # current frame's gray value.
        img_res_gpu = gpuarray.if_positive(img_buf_gpu, img_bgm, img_res_gpu)

        gray_buff = img_res_gpu.get()
        gray_buff = gray_buff.astype(np.uint8)
        frame = cv.cvtColor(gray_buff, cv.COLOR_GRAY2RGB)
        cv.imshow("Moving Detecting!", frame)
        if cv.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Always give the camera and the windows back, even after an error.
    cap.release()
    cv.destroyAllWindows()
def map_if_positive(self, expr):
    """Evaluate an if-positive expression node with ``gpuarray.if_positive``.

    The node's ``criterion``, ``then`` and ``else_`` children are each
    processed with ``self.rec`` (in that order) and the three results are
    passed on to ``gpuarray.if_positive``.
    """
    return gpuarray.if_positive(
        self.rec(expr.criterion),
        self.rec(expr.then),
        self.rec(expr.else_),
    )
# `python Homework5_script.py [number of iteration for MCMC]`
###############
import pycuda.driver as cuda
import pycuda.autoinit
from pycuda.compiler import SourceModule
import numpy as np
import pycuda.gpuarray as gpuarray
from pycuda import curandom
import sys

# Number of random points, taken from the first command-line argument.
N = int(sys.argv[1])  # int(1e3)
print("Number of iterations: " + str(N))

# Host-side draws copied into raw device memory.
# NOTE(review): draws, index and a_gpu are not read again in this fragment.
np.random.seed(123)
draws = np.random.uniform(-1, 1, N)
index = np.linspace(-1, 1, N)
a_gpu = cuda.mem_alloc(draws.nbytes)
cuda.memcpy_htod(a_gpu, draws)

# Draw N (x, y) pairs on the device and turn each into its distance from
# the origin.
gen = pycuda.curandom.XORWOWRandomNumberGenerator()
xy = gen.gen_uniform((2, N), np.float32)
xy = sum(xy**2)**0.5

# Score 0 for points with distance > 1 and 1 for the rest, then total them.
beyond_unit = xy - 1
score_outside = xy * 0
score_inside = xy * 0 + 1
M = gpuarray.sum(gpuarray.if_positive(beyond_unit, score_outside, score_inside))

pi = 4 * M / N
print("The estimated value of pi is: " + str(pi))
# Show the array created earlier.
print('a:\n{0}\nshape={1}\n'.format(a.get(), a.shape))

# Asynchronous upload of the host array on its own stream.
stream = drv.Stream()
b = gpuarray.to_gpu_async(h_array, stream=stream)
print('b:\n{0}\nshape={1}\n'.format(b.get(), b.shape))

# Uninitialised, zero-filled and range-filled device arrays.
c = gpuarray.empty((100, 100), dtype=dtype)
print('c:\n{0}\nshape={1}\n'.format(c, c.shape))

d = gpuarray.zeros((100, 100), dtype=dtype)
print('d:\n{0}\nshape={1}\n'.format(d, d.shape))

e = gpuarray.arange(0.0, 100.0, 1.0, dtype=dtype)
print('e:\n{0}\nshape={1}\n'.format(e, e.shape))

# Element-wise selection: entries below 50 take the first operand, the
# others the second.
shifted_down = e - 100
shifted_up = e + 100
f = gpuarray.if_positive(e < 50, shifted_down, shifted_up)
print('f:\n{0}\nshape={1}\n'.format(f, f.shape))

all_ones = gpuarray.ones_like(e)
all_zeros = gpuarray.zeros_like(e)
g = gpuarray.if_positive(e < 50, all_ones, all_zeros)
print('g:\n{0}\nshape={1}\n'.format(g, g.shape))

# Element-wise maximum / minimum of two arrays.
h = gpuarray.maximum(e, f)
print('h:\n{0}\nshape={1}\n'.format(h, h.shape))

i = gpuarray.minimum(e, f)
print('i:\n{0}\nshape={1}\n'.format(i, i.shape))

# Reductions over the whole array.
# NOTE(review): this rebinds g, replacing the mask computed above.
g = gpuarray.sum(a)
print(g, type(g))

k = gpuarray.max(a)