def cnvinv_objfun(self, z, sz, y_gpu, alpha=0., beta=0.):
    """
    Computes objective function value of 'lbfgsb' mode of deconv method.
    See deconv for details.

    The value is 0.5*<r, r> with r = y_gpu - self.cnv(z), plus
    0.5*alpha*<z, laplace(z)> when alpha > 0 and 0.5*beta*<z, z> when
    beta > 0. All inner products are accumulated in float64.

    Args:
        z: current estimate. A host ``np.ndarray`` is reshaped to ``sz``,
            cast to float32 and uploaded; anything else is used as the
            device array directly.
        sz: shape the host array is reshaped to before upload.
        y_gpu: observation, already on the device.
        alpha: weight of the Tikhonov term on the gradient of z.
        beta: weight of the Tikhonov term on z itself.

    Returns:
        The objective value copied back to the host with ``.get()``.

    Side effects:
        Stores the residual in ``self.res_gpu`` and, when alpha > 0, the
        Laplacian of z in ``self.lz_gpu``.
    """
    if isinstance(z, np.ndarray):
        z = np.array(np.reshape(z, sz)).astype(np.float32)
        z_gpu = cua.to_gpu(z)
    else:
        # Previously z_gpu was left unbound for non-ndarray input, which
        # raised NameError on the next line.
        # assumes such a z already lives on the GPU with shape sz -- TODO confirm
        z_gpu = z

    self.res_gpu = y_gpu - self.cnv(z_gpu)
    obj = 0.5 * cua.dot(self.res_gpu, self.res_gpu, dtype=np.float64)

    # Tikhonov regularization; the 'X' and 'F' cases are distinguished
    # because the corresponding z has a different size.
    # alpha > 0: Tikhonov on the gradient of z
    if alpha > 0:
        if self.__id__ == 'X':
            self.lz_gpu = shock.laplace_stack_gpu(z_gpu, mode='same')
        elif self.__id__ == 'F':
            self.lz_gpu = gputools.laplace_gpu(z_gpu, mode='same')
        obj += 0.5 * alpha * cua.dot(z_gpu, self.lz_gpu, dtype=np.float64)

    # beta > 0: Tikhonov on z
    if beta > 0:
        obj += 0.5 * beta * cua.dot(z_gpu, z_gpu, dtype=np.float64)

    return obj.get()
def Average_Alpha1(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return 2*dX*dY * Re(dot(Psi2, conj(Psi3)) + dot(Psi1, conj(Psi4))).

    Each dot product is evaluated on the device and copied to the host
    before its real part is taken.
    """
    pairs = ((Psi2_GPU, Psi3_GPU), (Psi1_GPU, Psi4_GPU))
    real_parts = [gpuarray.dot(left, right.conj()).get().real
                  for left, right in pairs]

    average = real_parts[0]
    average += real_parts[1]
    average *= 2.*self.dX*self.dY
    return average
def test_dot_allocator(self):
    """Check that gpuarray.dot honours a caller-supplied allocator.

    Currently skipped unconditionally; see the linked pycuda issue.
    """
    import pytest
    pytest.skip("https://github.com/inducer/pycuda/issues/163")

    import pycuda.tools
    mem_pool = pycuda.tools.DeviceMemoryPool()

    lhs_host = np.random.randint(low=512, high=1024, size=1024)
    rhs_host = np.random.randint(low=512, high=1024, size=1024)

    # Reference value computed on the CPU
    expected = np.dot(lhs_host, rhs_host)

    lhs_dev = gpuarray.to_gpu(lhs_host)
    rhs_dev = gpuarray.to_gpu(rhs_host)

    # Same product on the GPU, once per allocator
    default_result = gpuarray.dot(lhs_dev, rhs_dev)
    pooled_result = gpuarray.dot(lhs_dev, rhs_dev, allocator=mem_pool.allocate)

    # Both must agree with the CPU value
    assert expected == default_result.get()
    assert expected == pooled_result.get()

    # Each result must carry the allocator it was created with
    assert default_result.allocator == lhs_dev.allocator
    assert pooled_result.allocator == mem_pool.allocate
def cnvinv_objfun(self, z, sz, y_gpu, alpha=0., beta=0.):
    """
    Computes objective function value of 'lbfgsb' mode of deconv method.
    See deconv for details.

    The value is 0.5*<r, r> with r = y_gpu - self.cnv(z), plus
    0.5*alpha*<z, laplace(z)> when alpha > 0 and 0.5*beta*<z, z> when
    beta > 0. Each inner product is cast to float64 after it is computed.

    Args:
        z: current estimate. A host ``np.ndarray`` is reshaped to ``sz``,
            cast to float32 and uploaded; anything else is used as the
            device array directly.
        sz: shape the host array is reshaped to before upload.
        y_gpu: observation, already on the device.
        alpha: weight of the Tikhonov term on the gradient of z.
        beta: weight of the Tikhonov term on z itself.

    Returns:
        The objective value copied back to the host with ``.get()``.

    Side effects:
        Stores the residual in ``self.res_gpu`` and, when alpha > 0, the
        Laplacian of z in ``self.lz_gpu``.
    """
    if isinstance(z, np.ndarray):
        z = np.array(np.reshape(z, sz)).astype(np.float32)
        z_gpu = cua.to_gpu(z)
    else:
        # Previously z_gpu was left unbound for non-ndarray input, which
        # raised NameError on the next line.
        # assumes such a z already lives on the GPU with shape sz -- TODO confirm
        z_gpu = z

    self.res_gpu = y_gpu - self.cnv(z_gpu)
    obj = 0.5 * (cua.dot(self.res_gpu, self.res_gpu).astype(np.float64))

    # Tikhonov regularization; the 'X' and 'F' cases are distinguished
    # because the corresponding z has a different size.
    # alpha > 0: Tikhonov on the gradient of z
    if alpha > 0:
        if self.__id__ == 'X':
            self.lz_gpu = shock.laplace_stack_gpu(z_gpu, mode='same')
        elif self.__id__ == 'F':
            self.lz_gpu = gputools.laplace_gpu(z_gpu, mode='same')
        obj += 0.5 * alpha * (cua.dot(z_gpu, self.lz_gpu).astype(np.float64))

    # beta > 0: Tikhonov on z
    if beta > 0:
        obj += 0.5 * beta * (cua.dot(z_gpu, z_gpu).astype(np.float64))

    return obj.get()
def Average_Alpha2(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return -2*dX*dY * Im(dot(Psi3, conj(Psi2)) + dot(Psi1, conj(Psi4))).

    Each dot product is evaluated on the device and copied to the host
    before its imaginary part is taken.
    """
    pairs = ((Psi3_GPU, Psi2_GPU), (Psi1_GPU, Psi4_GPU))
    imag_parts = [gpuarray.dot(left, right.conj()).get().imag
                  for left, right in pairs]

    average = imag_parts[0]
    average += imag_parts[1]
    average *= -2.*self.dX*self.dY
    return average
def test_dot_allocator(self):
    """Check that gpuarray.dot honours a caller-supplied allocator.

    Currently skipped unconditionally; see the linked pycuda issue.
    """
    # FIXME
    import pytest
    pytest.skip("https://github.com/inducer/pycuda/issues/163")

    import pycuda.tools
    mem_pool = pycuda.tools.DeviceMemoryPool()

    lhs_host = np.random.randint(low=512, high=1024, size=1024)
    rhs_host = np.random.randint(low=512, high=1024, size=1024)

    # Reference value computed on the CPU
    expected = np.dot(lhs_host, rhs_host)

    lhs_dev = gpuarray.to_gpu(lhs_host)
    rhs_dev = gpuarray.to_gpu(rhs_host)

    # Same product on the GPU, once per allocator
    default_result = gpuarray.dot(lhs_dev, rhs_dev)
    pooled_result = gpuarray.dot(lhs_dev, rhs_dev, allocator=mem_pool.allocate)

    # Both must agree with the CPU value
    assert expected == default_result.get()
    assert expected == pooled_result.get()

    # Each result must carry the allocator it was created with
    assert default_result.allocator == lhs_dev.allocator
    assert pooled_result.allocator == mem_pool.allocate
def Average_Beta(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dX*dY * (n1 + n2 - n3 - n4), n_k = dot(Psi_k, conj(Psi_k)).

    Every n_k is computed on the device and copied to the host.
    """
    def self_overlap(psi):
        return gpuarray.dot(psi, psi.conj()).get()

    average = self_overlap(Psi1_GPU)
    average += self_overlap(Psi2_GPU)
    average -= self_overlap(Psi3_GPU)
    average -= self_overlap(Psi4_GPU)

    average *= self.dX*self.dY
    return average
def Average_Alpha2(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return 1j*dX*dY*dZ * (-<4,1> + <3,2> - <2,3> + <1,4>).

    Here <a,b> stands for dot(Psi_a, conj(Psi_b)), computed on the device
    and copied to the host.
    """
    def overlap(left, right):
        return gpuarray.dot(left, right.conj()).get()

    average = -overlap(Psi4_GPU, Psi1_GPU)
    average += overlap(Psi3_GPU, Psi2_GPU)
    average -= overlap(Psi2_GPU, Psi3_GPU)
    average += overlap(Psi1_GPU, Psi4_GPU)

    average *= 1j*self.dX*self.dY*self.dZ
    return average
def check_termination(self):
    """
    Check various termination criteria

    The criteria are tested in a fixed order (time limit, relative change
    in x, change in objective value, iteration cap, KKT violation,
    gradient infinity norm). ``self.term_reason`` is set to a descriptive
    string for the first criterion met, or to 0 when none is met. The
    method itself always returns None.

    Side effects:
        Updates ``self.time`` when ``options.time_limit`` is set.
    """
    opts = self.options

    # First check if we are doing termination based on running time
    if opts.time_limit:
        # time.clock has to be called; subtracting a float from the
        # function object itself raised TypeError.
        self.time = time.clock() - self.time_start
        if self.time >= opts.maxtime:
            self.term_reason = 'Exceeded time limit'
            return

    # Now check if we are doing break by tolx
    if opts.use_tolx:
        dx_norm = np.sqrt(cua.dot(self.dx, self.dx).get())
        oldx_norm = np.sqrt(cua.dot(self.oldx, self.oldx).get())
        if dx_norm / oldx_norm < opts.tolx:
            self.term_reason = 'Relative change in x small enough'
            return

    # Are we doing break by tolo (tol obj val)
    if opts.use_tolo and self.iter > 2:
        delta = abs(self.obj - self.oldobj)
        if delta < opts.tolo:
            self.term_reason = 'Relative change in objvalue small enough'
            return

    # A combined "change in x and change in gradient are both tiny" test
    # is deliberately not performed for now.

    # Finally the plain old check if max iter has been achieved
    if self.iter >= opts.maxiter:
        self.term_reason = 'Maximum number of iterations reached'
        return

    # KKT violation
    if opts.use_kkt:
        # The tolerance lives on self.options; the bare name `options`
        # is not defined in this method.
        # NOTE(review): sqrt of a negative x.grad is NaN, so this test can
        # never fire in that case -- confirm whether abs belongs inside.
        if np.abs(np.sqrt(cua.dot(self.x, self.grad).get())) <= opts.tolk:
            self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
            return

    # Gradient check
    if opts.use_tolg:
        nr = cua.max(cua.fabs(self.grad)).get()
        if nr < opts.tolg:
            self.term_reason = '|| grad ||_inf < opt.tolg'
            return

    # No condition met, so return false
    self.term_reason = 0
def Average_Beta(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dX*dY * (n1 + n2 - n3 - n4), n_k = dot(Psi_k, conj(Psi_k)).

    Every n_k is computed on the device and copied to the host.
    """
    added = (Psi1_GPU, Psi2_GPU)
    subtracted = (Psi3_GPU, Psi4_GPU)

    plus_terms = [gpuarray.dot(psi, psi.conj()).get() for psi in added]
    average = plus_terms[0]
    average += plus_terms[1]

    for psi in subtracted:
        average -= gpuarray.dot(psi, psi.conj()).get()

    average *= self.dX*self.dY
    return average
def Average_Py(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dX*dY * sum_k dot(|Psi_k|**2, self.Py_GPU) for k = 1..4.

    Each dot product is computed on the device and copied to the host.
    """
    components = (Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU)
    partials = [gpuarray.dot(abs(psi)**2, self.Py_GPU).get()
                for psi in components]

    average = partials[0]
    for partial in partials[1:]:
        average += partial

    average *= self.dX*self.dY
    return average
def Average_Y(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dX*dY * sum_k dot(|Psi_k|**2, self.Y_GPU) for k = 1..4.

    Each dot product is computed on the device and copied to the host.
    """
    components = (Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU)
    partials = [gpuarray.dot(abs(psi)**2, self.Y_GPU).get()
                for psi in components]

    average = partials[0]
    for partial in partials[1:]:
        average += partial

    average *= self.dX*self.dY
    return average
def Average_Py(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dPx*dPy * sum_k dot(|Psi_k|**2, self.Py_GPU) for k = 1..4.

    Each dot product is computed on the device and copied to the host.
    """
    components = (Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU)
    partials = [gpuarray.dot(abs(psi)**2, self.Py_GPU).get()
                for psi in components]

    average = partials[0]
    for partial in partials[1:]:
        average += partial

    average *= self.dPx*self.dPy
    return average
def _Average_Px(self, Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU):
    """Return dX*dY*dZ * sum_k dot(|Psi_k|**2, self.Px_GPU) for k = 1..4.

    Each dot product is computed on the device and copied to the host.
    """
    components = (Psi1_GPU, Psi2_GPU, Psi3_GPU, Psi4_GPU)
    partials = [gpuarray.dot(abs(psi)**2, self.Px_GPU).get()
                for psi in components]

    average = partials[0]
    for partial in partials[1:]:
        average += partial

    average *= self.dX*self.dY*self.dZ
    return average
def compute_obj(self, w_gpu):
    """Return the objective value for ``w_gpu``.

    The value is the sum of
      * 0.5 * lamda * <d, d> with d = self.weight(w_gpu) - self.data_gpu,
      * 0.5 * beta  * <w - u, w - u> with u = self.u_gpu, and
      * 0.5 * eta   * <w, laplace3d_gpu(w)> when ``self.eta`` is truthy.

    Side effects:
        Stores d in ``self.dfs_gpu``.
    """
    self.dfs_gpu = 1. * (self.weight(w_gpu) - self.data_gpu)
    misfit = self.dfs_gpu
    data_term = 0.5 * self.lamda * cua.dot(misfit, misfit)

    offset = w_gpu - self.u_gpu
    penalty = 0.5 * self.beta * cua.dot(offset, offset)

    if self.eta:
        penalty += 0.5 * self.eta * cua.dot(w_gpu, laplace3d_gpu(w_gpu))

    return data_term + penalty
def check_termination(self):
    """
    Check various termination criteria

    The criteria are tested in a fixed order (time limit, relative change
    in x, change in objective value, iteration cap, KKT violation,
    gradient infinity norm). ``self.term_reason`` is set to a descriptive
    string for the first criterion met, or to 0 when none is met. The
    method itself always returns None.

    Side effects:
        Updates ``self.time`` when ``options.time_limit`` is set.
    """
    opts = self.options

    # First check if we are doing termination based on running time
    if opts.time_limit:
        # time.clock has to be called; subtracting a float from the
        # function object itself raised TypeError.
        self.time = time.clock() - self.time_start
        if self.time >= opts.maxtime:
            self.term_reason = 'Exceeded time limit'
            return

    # Now check if we are doing break by tolx
    if opts.use_tolx:
        dx_norm = np.sqrt(cua.dot(self.dx, self.dx).get())
        oldx_norm = np.sqrt(cua.dot(self.oldx, self.oldx).get())
        if dx_norm / oldx_norm < opts.tolx:
            self.term_reason = 'Relative change in x small enough'
            return

    # Are we doing break by tolo (tol obj val)
    if opts.use_tolo and self.iter > 2:
        delta = abs(self.obj - self.oldobj)
        if delta < opts.tolo:
            self.term_reason = 'Relative change in objvalue small enough'
            return

    # A combined "change in x and change in gradient are both tiny" test
    # is deliberately not performed for now.

    # Finally the plain old check if max iter has been achieved
    if self.iter >= opts.maxiter:
        self.term_reason = 'Maximum number of iterations reached'
        return

    # KKT violation
    if opts.use_kkt:
        # The tolerance lives on self.options; the bare name `options`
        # is not defined in this method.
        # NOTE(review): sqrt of a negative x.grad is NaN, so this test can
        # never fire in that case -- confirm whether abs belongs inside.
        if np.abs(np.sqrt(cua.dot(self.x, self.grad).get())) <= opts.tolk:
            self.term_reason = '|x^T * grad| < opt.pbb_gradient_norm'
            return

    # Gradient check
    if opts.use_tolg:
        nr = cua.max(cua.fabs(self.grad)).get()
        if nr < opts.tolg:
            self.term_reason = '|| grad ||_inf < opt.tolg'
            return

    # No condition met, so return false
    self.term_reason = 0
def one_iteration(self, compute_real_residual=False):
    """Perform one conjugate-gradient step, updating the solver state.

    Follows J.R. Shewchuk, "An Introduction to the Conjugate Gradient
    Method Without the Agonizing Pain", Edition 1 1/4 [8/1994],
    Appendix B3.

    ``self.x``, ``self.residual``, ``self.d`` and ``self.delta`` are
    updated. When ``compute_real_residual`` is true the residual is
    recomputed from ``self.rhs`` and the operator instead of being updated
    incrementally, and the new inner product object is appended to
    ``self.real_delta_queue``.
    """
    search_dir = self.d
    op_on_dir = self.operator(search_dir)
    curvature = gpuarray.dot(search_dir, op_on_dir)
    step = self.guarded_div(self.delta, curvature)

    self.lc2(1, self.x, step, search_dir, out=self.x)

    if not compute_real_residual:
        self.lc2(1, self.residual, -step, op_on_dir, out=self.residual)
    else:
        self.residual = self.lc2(
                1, self.rhs, -1, self.operator(self.x))

    preconditioned = self.precon(self.residual)
    previous_delta = self.delta
    inner = AsyncInnerProduct(self.residual, preconditioned,
            self.pagelocked_allocator)
    self.delta = inner.gpu_result

    ratio = self.guarded_div(self.delta, previous_delta)
    self.lc2(1, preconditioned, ratio, self.d, out=self.d)

    if compute_real_residual:
        self.real_delta_queue.append(inner)
def __init__(self, a, b, pagelocked_allocator):
    """Launch dot(a, b) on the device and record an event right after it.

    The device result is kept in ``self.gpu_result``; the recorded event
    in ``self.gpu_finished_evt``. ``self.gpu_finished`` starts as False
    and ``pagelocked_allocator`` is stored unchanged.
    """
    self.pagelocked_allocator = pagelocked_allocator
    self.gpu_finished = False

    # The event has to be recorded after the dot product is enqueued.
    self.gpu_result = gpuarray.dot(a, b)
    finished_evt = drv.Event()
    self.gpu_finished_evt = finished_evt
    finished_evt.record()
def gpuErrorEvaluate(actual, expected):
    """Return 1.0 - <p, p> with p = actual - expected, computed on the GPU.

    A default context is created for the computation and popped again
    before returning, also when the computation raises.

    Args:
        actual: array-like, converted with ``numpy.array`` and uploaded.
        expected: array-like, converted with ``numpy.array`` and uploaded.

    Returns:
        The result of ``1.0 - gpuarray.dot(p, p)``.
        NOTE(review): this is returned without ``.get()`` after its
        context has been popped -- confirm callers can still read it.
    """
    context = make_default_context()
    try:
        diff = (gpuarray.to_gpu(numpy.array(actual))
                - gpuarray.to_gpu(numpy.array(expected)))
        res = 1.0 - gpuarray.dot(diff, diff)
    finally:
        # Always release the context, otherwise a failed upload or kernel
        # launch leaves it on the context stack.
        context.pop()
    return res
def test_dot(self):
    """gpuarray.dot must agree with np.dot to 1e-4 relative error.

    Checked for a range of vector lengths filled with device-generated
    random numbers.
    """
    from pycuda.curandom import rand as curand

    lengths = [
            2, 3, 4, 5, 6, 7,
            31, 32, 33,
            127, 128, 129,
            255, 256, 257,
            16384 - 993,
            20000,
            ]

    for length in lengths:
        lhs_dev = curand((length,))
        lhs_host = lhs_dev.get()
        rhs_dev = curand((length,))
        rhs_host = rhs_dev.get()

        expected = np.dot(lhs_host, rhs_host)
        actual = gpuarray.dot(lhs_dev, rhs_dev).get()

        relative_error = abs(actual - expected) / abs(expected)
        assert relative_error < 1e-4
def one_iteration(self, compute_real_residual=False):
    """Perform one conjugate-gradient step, updating the solver state.

    Follows J.R. Shewchuk, "An Introduction to the Conjugate Gradient
    Method Without the Agonizing Pain", Edition 1 1/4 [8/1994],
    Appendix B3.

    ``self.x``, ``self.residual``, ``self.d`` and ``self.delta`` are
    updated. When ``compute_real_residual`` is true the residual is
    recomputed from ``self.rhs`` and the operator instead of being updated
    incrementally, and the new inner product object is appended to
    ``self.real_delta_queue``.
    """
    search_dir = self.d
    op_on_dir = self.operator(search_dir)
    curvature = gpuarray.dot(search_dir, op_on_dir)
    step = self.guarded_div(self.delta, curvature)

    self.lc2(1, self.x, step, search_dir, out=self.x)

    if not compute_real_residual:
        self.lc2(1, self.residual, -step, op_on_dir, out=self.residual)
    else:
        self.residual = self.lc2(1, self.rhs, -1, self.operator(self.x))

    preconditioned = self.precon(self.residual)
    previous_delta = self.delta
    inner = AsyncInnerProduct(self.residual, preconditioned,
                              self.pagelocked_allocator)
    self.delta = inner.gpu_result

    ratio = self.guarded_div(self.delta, previous_delta)
    self.lc2(1, preconditioned, ratio, self.d, out=self.d)

    if compute_real_residual:
        self.real_delta_queue.append(inner)
def norm(self):
    """The L2-norm on the flattened vector.

    Computed on the device when ``self.state`` is DEVICE, otherwise from
    ``self.data_ro`` on the host.

    Raises:
        RuntimeError: if the state is none of the known ones.
    """
    if self.state is DeviceDataMixin.DEVICE:
        squared = gpuarray.dot(self.array, self.array).get()
        return np.sqrt(squared)

    host_side_states = [DeviceDataMixin.DEVICE_UNALLOCATED,
                        DeviceDataMixin.HOST,
                        DeviceDataMixin.BOTH]
    if self.state not in host_side_states:
        raise RuntimeError('Data neither on host nor device, oops!')

    return np.sqrt(np.dot(self.data_ro, self.data_ro))
def magnitude(vec, vec2):
    """Upload both vectors, combine their dot product and norms, print and return it.

    Computes ``dot(vec, vec2) / |vec| + |vec2|`` on the device in float32,
    prints the device value and returns it copied to the host.

    NOTE(review): the expression evaluates as (dot / |vec|) + |vec2|.
    Confirm whether dot / (|vec| * |vec2|) was intended.
    """
    # An earlier hand-written 'magnitude' kernel was replaced by the
    # gpuarray primitives used below.
    first_dev = gpuarr.to_gpu_async(vec)
    second_dev = gpuarr.to_gpu_async(vec2)

    first_norm = cumath.sqrt(
        gpuarr.dot(first_dev, first_dev, dtype=np.float32))
    second_norm = cumath.sqrt(
        gpuarr.dot(second_dev, second_dev, dtype=np.float32))

    cross = gpuarr.dot(first_dev, second_dev, dtype=np.float32)
    product = cross / first_norm + second_norm

    print(product)
    return product.get()
def __call__(self, tcurr, nsteps, solprev, solcurr):
    """Report sqrt(<d, d> / <solprev, solprev>) with d = solcurr - solprev.

    Runs only when ``self.nsteps`` is positive and ``nsteps`` is either 1
    or a multiple of ``self.nsteps``. Both sums are reduced over all ranks
    onto the root rank, which prints the value.
    """
    is_due = self.nsteps > 0 and (nsteps % self.nsteps == 0 or nsteps == 1)
    if not is_due:
        return

    comm, rank, root = get_comm_rank_root()

    change = solcurr - solprev
    res = np.array([
        gpuarray.dot(change, change).get(),
        gpuarray.dot(solprev, solprev).get(),
    ])

    if rank == root:
        # The root reduces in place and then holds the global sums.
        comm.Reduce(get_mpi('in_place'), res, op=get_mpi('sum'), root=root)
        print("residual at t = ", tcurr, np.sqrt(res[0] / res[1]))
    else:
        comm.Reduce(res, None, op=get_mpi('sum'), root=root)
def test_dot(self):
    """gpuarray.dot must agree with np.dot to 1e-4 relative error.

    Checked on two device-generated random vectors of length 200000.
    """
    from pycuda.curandom import rand as curand

    shape = (200000,)
    lhs_dev = curand(shape)
    lhs_host = lhs_dev.get()
    rhs_dev = curand(shape)
    rhs_host = rhs_dev.get()

    expected = np.dot(lhs_host, rhs_host)
    actual = gpuarray.dot(lhs_dev, rhs_dev).get()

    relative_error = abs(actual - expected) / abs(expected)
    assert relative_error < 1e-4
def test_dot(self):
    """gpuarray.dot must agree with numpy.dot to 1e-4 relative error.

    Checked on two device-generated random vectors of length 200000.
    """
    from pycuda.curandom import rand as curand

    shape = (200000,)
    lhs_dev = curand(shape)
    lhs_host = lhs_dev.get()
    rhs_dev = curand(shape)
    rhs_host = rhs_dev.get()

    expected = numpy.dot(lhs_host, rhs_host)
    actual = gpuarray.dot(lhs_dev, rhs_dev).get()

    relative_error = abs(actual - expected) / abs(expected)
    assert relative_error < 1e-4
def test_dot(self):
    """gpuarray.dot must agree with np.dot to 1e-4 relative error.

    Checked for a range of vector lengths filled with device-generated
    random numbers.
    """
    from pycuda.curandom import rand as curand

    lengths = [2, 3, 4, 5, 6, 7,
               31, 32, 33,
               127, 128, 129,
               255, 256, 257,
               16384 - 993,
               20000]

    for length in lengths:
        lhs_dev = curand((length,))
        lhs_host = lhs_dev.get()
        rhs_dev = curand((length,))
        rhs_host = rhs_dev.get()

        expected = np.dot(lhs_host, rhs_host)
        actual = gpuarray.dot(lhs_dev, rhs_dev).get()

        relative_error = abs(actual - expected) / abs(expected)
        assert relative_error < 1e-4
def dotc_gpu(x, y=None):
    """Calculate complex dot product on GPU.

    Both inputs are flattened and the second one is conjugated before the
    dot product. If y is not provided, <x, x> is calculated instead.

    Args:
        x: Vector (presumably a GPU array -- it is passed to
            ``gpuarray.dot``; verify against callers).
        y: Vector of the same kind, or None.

    Returns:
        ndarray: Absolute of complex dot product.
    """
    other = x if y is None else y
    inner = gpuarray.dot(x.ravel(), other.ravel().conj())
    return np.abs(inner.get())
def test_dot(self):
    """
    Test dot-product.

    For every dtype and every shape in ``self.shapes`` the device result
    must be within 10% of numpy's result on the flattened host copies.
    """
    for dtype in (numpy.float32, numpy.float64,
                  numpy.complex64, numpy.complex128):
        for shape in self.shapes:
            x_dev = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))
            y_dev = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))

            expected = numpy.dot(x_dev.get().flatten(), y_dev.get().flatten())
            actual = gpuarray.dot(x_dev, y_dev).get()

            percent_error = abs(expected - actual)/abs(expected)*100
            self.assertTrue(percent_error < 10.0, 'Error above 10%.')
def test_dot(self):
    """
    Test dot-product.

    For every dtype and every shape in ``self.shapes`` the device result
    must be within 10% of numpy's result on the flattened host copies.
    """
    for dtype in (numpy.float32, numpy.float64,
                  numpy.complex64, numpy.complex128):
        for shape in self.shapes:
            x_dev = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))
            y_dev = gpuarray.to_gpu(numpy.random.randn(*shape).astype(dtype))

            expected = numpy.dot(x_dev.get().flatten(), y_dev.get().flatten())
            actual = gpuarray.dot(x_dev, y_dev).get()

            percent_error = abs(expected - actual) / abs(expected) * 100
            self.assertTrue(percent_error < 10.0, 'Error above 10%.')
def amplitude_compute_gpu(vector, atom_factors, frame):
    """Accumulate f * cos(q.r) and f * sin(q.r) over the atoms of a frame.

    For each atom, r is taken from fields 1..3 of the atom record and f is
    the second field of every ``atom_factors`` entry whose first field
    equals field 4 of the atom. The phase q.r is evaluated on the GPU.

    Args:
        vector: the vector q, convertible with ``numpy.asarray``.
        atom_factors: sequence of (key, factor, ...) records.
        frame: sequence of atom records indexed as described above.

    Returns:
        Tuple ``(f_a_real, f_a_imag)`` of the accumulated cosine and sine
        sums. Both are plain 0 when the frame is empty.

    NOTE(review): factors are matched to positions by index, so an atom
    with no matching factor (or several) shifts the pairing -- confirm
    that every atom has exactly one entry in atom_factors.
    """
    positions = []
    factors_per_atom = []
    for atom in frame:
        positions.append([atom[1], atom[2], atom[3]])
        factors_per_atom.extend(
            entry[1] for entry in atom_factors if entry[0] == atom[4])

    n_vector = numpy.asarray(vector)
    n_frame = numpy.asarray(positions)

    # The vector does not change inside the loop, so upload it once
    # instead of once per atom.
    gpu_vector = gpuarray.to_gpu(n_vector)

    f_a_real = 0
    f_a_imag = 0
    for index, position in enumerate(n_frame):
        gpu_frame = gpuarray.to_gpu(position)
        gpu_result = gpuarray.dot(gpu_vector, gpu_frame)
        gpu_sin = gpumath.sin(gpu_result)
        gpu_cos = gpumath.cos(gpu_result)

        f_q = factors_per_atom[index]
        f_a_real += f_q * gpu_cos
        f_a_imag += f_q * gpu_sin

    return f_a_real, f_a_imag
def compute_obj(self, f):
    """Return 0.5 * <r, r> with r = self.X.cnv(f) - self.y.

    The result is whatever ``cua.dot`` returns, scaled by 0.5; it is not
    copied to the host here.
    """
    residual = self.X.cnv(f) - self.y
    squared_norm = cua.dot(residual, residual)
    return 0.5 * squared_norm
def _perform_dot(self, v1, v2):
    """Return ``dot(v1, v2)`` using the ``dot`` visible in this module."""
    result = dot(v1, v2)
    return result
def compute_grad(self, x):
    """Return cua.dot(A^T, cua.dot(A, x) - b) for the stored A and b.

    NOTE(review): if ``cua`` is pycuda.gpuarray, ``dot`` is an inner
    product of the flattened operands, not a matrix product -- confirm
    this computes the intended A^T (A x - b).
    """
    residual = cua.dot(self.A, x) - self.b
    return cua.dot(self.A.T, residual)
def __init__(self, objective, x_init, options):
    """Run the Projected Barzilai-Borwein iteration to completion.

    The constructor performs the whole optimisation: it initialises the
    state from ``x_init``, iterates at most ``options.maxiter`` times,
    stopping early once ``check_termination`` sets a reason, and stores
    the final iterate in ``self.result``.

    Raises:
        IOError: if the initial vector has norm below 1e-12.
    """
    self.objective = objective
    self.options = options
    self.time_start = time.clock()
    self.iter = 0
    self.status = 'Failure'

    # ---- Initialisation ----
    self.initialisation(x_init)

    # ---- Sanity checks ----
    initial_norm = np.sqrt(cua.dot(self.x, self.x).get())
    if initial_norm < 1e-12:
        raise IOError('Initial vector close to zero. Cannot proceed')

    # ---- Prime the pump ----
    if options.verbose:
        print('Running Projected Barzilai Borwein:\n')

    # ---- Main iterative loop ----
    for _ in range(options.maxiter):
        self.iter += 1
        self.show_status()

        change_x = self.x - self.oldx
        change_g = self.g - self.oldg
        if not options.unconstrained:
            for change in (change_x, change_g):
                clip2bound(change, self.x, self.g)
        self.dx, self.dg = change_x, change_g

        # Check termination criteria
        self.check_termination()
        if self.term_reason:
            break

        # store x & gradient
        self.oldx, self.oldg = self.x, self.g

        # update x & gradient; the two step-length formulas alternate
        # between even and odd iterations
        if self.iter % 2 == 0:
            numerator = cua.sum(change_x * change_x)
            denominator = cua.sum(change_x * change_g)
        else:
            numerator = cua.sum(change_x * change_g)
            denominator = cua.sum(change_g * change_g)
        step = (numerator / (0.00001 + denominator)).get()

        self.x = self.x - self.g * step
        if not options.unconstrained:
            gputools.cliplower_GPU(self.x, 0)  # projection

        if options.compute_both:
            self.oldobj = self.obj
            self.obj, self.g = objective.compute_both(self.x)
        else:
            self.g = objective.compute_grad(self.x)
            if options.compute_obj:
                self.oldobj = self.obj
                self.obj = objective.compute_obj(self.x)

    # ---- Final statistics and wrap up ----
    self.time = time.clock() - self.time_start
    self.status = 'Success'

    if self.options.verbose:
        for message in (self.status, self.term_reason, 'Done\n'):
            print(message)
    self.result = self.x
# Exponential on the device, timed with the start/end events.
start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Same exponential on the host.
start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("CPU calc time: %fs" % elapsed_s)

print("Timing vectorized dot product/sum of squares:")

# First device dot product, on the short array (reported as "initial").
start.record()
gpuarray.dot(dev_x_short, dev_x_short)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("GPU array calc time (initial): %fs" % elapsed_s)

# Device dot product on the full array.
start.record()
gpuarray.dot(dev_x, dev_x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Host dot product.
start.record()
np.dot(x, x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("CPU calc time: %fs" % elapsed_s)
# Exponential on the device, timed with the start/end events.
start.record()
dev_expx = cumath.exp(dev_x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Same exponential on the host.
start.record()
exp_x = np.exp(x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("CPU calc time: %fs" % elapsed_s)

print("Timing vectorized dot product/sum of squares:")

# First device dot product, on the short array (reported as "initial").
start.record()
gpuarray.dot(dev_x_short, dev_x_short)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("GPU array calc time (initial): %fs" % elapsed_s)

# Device dot product on the full array.
start.record()
gpuarray.dot(dev_x, dev_x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Host dot product.
start.record()
np.dot(x, x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("CPU calc time: %fs" % elapsed_s)
# Report the transfer that was timed just before this point.
elapsed_s = start.time_till(end)*1e-3
print("Transfer to GPU time: %fs" % elapsed_s)

print("Timing vectorized exponentiation:")

# Exponential on the device.
start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Same exponential on the host.
start.record()
expX = np.exp(x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("CPU calc time: %fs" % elapsed_s)

print("Timing vectorized dot product/sum of squares:")

# Dot product on the device.
start.record()
gpuarray.dot(dX, dX)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Dot product on the host.
start.record()
np.dot(x, x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end)*1e-3
print("CPU calc time: %fs" % elapsed_s)
def conjugate_gradient(self, init_delta, grad, iters=250, printing=False):
    """Find minimum of quadratic approximation using conjugate gradient
    algorithm.

    Runs on the GPU or on the host depending on ``self.net.use_GPU``.
    On the host path ``init_delta`` is updated in place.

    Args:
        init_delta: starting point for the weight update.
        grad: gradient; its negative is used as the right-hand side.
        iters: maximum number of CG iterations.
        printing: print per-iteration diagnostics when true.

    Returns:
        List of ``(iteration, delta)`` snapshots kept for backtracking;
        the last entry is the final delta.
    """
    net = self.net
    if net.debug:
        net.check_grad(grad)

    store_iter = 5
    store_mult = 1.3
    deltas = []
    grad = -grad  # note negative, some CG algorithms are flipped
    vals = np.zeros(iters, dtype=net.dtype)

    if net.use_GPU:
        from pycuda import gpuarray

        base_grad = gpuarray.to_gpu(grad)
        delta = gpuarray.to_gpu(init_delta)
        G_dir = gpuarray.zeros(grad.shape, dtype=net.dtype)

        def dot(a, b):
            return gpuarray.dot(a, b).get()

        def get(x):
            return x.get(pagelocked=True)

        self.calc_G = net.GPU_calc_G
    else:
        base_grad = grad
        delta = init_delta
        G_dir = np.zeros_like(grad)
        dot = np.dot

        def get(x):
            return x.copy()

        self.calc_G = net.calc_G

    residual = base_grad.copy()
    residual -= self.calc_G(delta, damping=self.damping, out=G_dir)
    res_norm = dot(residual, residual)
    direction = residual.copy()

    for i in range(iters):
        if printing:
            print("-" * 20)
            print("CG iteration %s" % (i,))
            print("delta norm %s" % (np.linalg.norm(get(delta)),))
            print("direction norm %s" % (np.linalg.norm(get(direction)),))

        # fills G_dir through the out argument
        self.calc_G(direction, damping=self.damping, out=G_dir)

        # calculate step size
        step = res_norm / dot(direction, G_dir)
        if not np.isfinite(step):
            warnings.warn("Non-finite step value (%f)" % step)
            step = np.nan_to_num(step)

        if printing:
            print("G_dir norm %s" % (np.linalg.norm(get(G_dir)),))
            print("step %s" % (step,))

        if net.debug:
            tmp_G_dir = get(G_dir)
            tmp_dir = get(direction)
            net.check_G(tmp_G_dir, tmp_dir, self.damping)

            assert np.isfinite(step)
            assert step >= 0
            damped = np.linalg.norm(np.dot(tmp_dir, tmp_G_dir))
            undamped = np.linalg.norm(
                np.dot(tmp_dir, net.calc_G(tmp_dir, damping=0)))
            assert damped >= undamped

        # update weight delta
        delta += step * direction

        # update residual
        residual -= step * G_dir
        new_res_norm = dot(residual, residual)

        if new_res_norm < 1e-20:
            # early termination (mainly to prevent numerical errors);
            # the main termination condition is below.
            break

        # update direction
        beta = new_res_norm / res_norm
        direction *= beta
        direction += residual
        res_norm = new_res_norm

        # store deltas for backtracking
        if i == store_iter:
            deltas.append((i, get(delta)))
            store_iter = int(store_iter * store_mult)

        # martens termination conditions
        vals[i] = -0.5 * dot(residual + base_grad, delta)
        gap = max(int(0.1 * i), 10)

        if printing:
            print("termination val %s" % (vals[i],))

        if (i > gap and vals[i - gap] < 0 and
                (vals[i] - vals[i - gap]) / vals[i] < 5e-6 * gap):
            break

    deltas.append((i, get(delta)))
    return deltas
# Finish and report the transfer that was started before this point.
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("Transfer to GPU time: %fs" % elapsed_s)

print("Timing vectorized exponentiation:")

# Exponential on the device.
start.record()
dexpX = cumath.exp(dX)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Same exponential on the host.
start.record()
expX = np.exp(x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("CPU calc time: %fs" % elapsed_s)

print("Timing vectorized dot product/sum of squares:")

# Dot product on the device.
start.record()
gpuarray.dot(dX, dX)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("GPU array calc time: %fs" % elapsed_s)

# Dot product on the host.
start.record()
np.dot(x, x)
end.record()
end.synchronize()
elapsed_s = start.time_till(end) * 1e-3
print("CPU calc time: %fs" % elapsed_s)
def compute_obj(self, f):
    """Return 0.5 * <r, r> with r = self.X.cnv(f) - self.y.

    The result is whatever ``cua.dot`` returns, scaled by 0.5; it is not
    copied to the host here.
    """
    residual = self.X.cnv(f) - self.y
    squared_norm = cua.dot(residual, residual)
    return 0.5 * squared_norm
def kernel(a, b):
    """Return pycuda's dot product of ``a`` and ``b``, copied to the host."""
    from pycuda import gpuarray

    device_result = gpuarray.dot(a, b)
    return device_result.get()
# Compare and time a "dot" of two small float32 matrices on CPU and GPU.
import pycuda.driver as cuda
import pycuda.autoinit
# Previously missing: every gpuarray.* call below raised NameError.
import pycuda.gpuarray as gpuarray
import numpy
import time

n = 4
a = numpy.float32(numpy.random.randn(n, n))
b = numpy.float32(numpy.random.randn(n, n))

# Overwrite the random start values so that a[i, j] == b[i, j] == i + j.
for i in range(n):
    for j in range(n):
        a[i, j] = i + j
        b[i, j] = i + j

# ---- CPU ----
tic = time.time()
axb = a * b
print(a)
print(b)
print("====")
print(numpy.dot(a, b))
toc = time.time() - tic
print("%s s for CPU" % toc)

# ---- GPU ----
# NOTE(review): pycuda documents gpuarray.dot as the dot product of the
# flattened arrays, whereas numpy.dot above is a matrix product for 2-D
# input -- the two printed results are not expected to match.
tic = time.time()
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(b)
axbGPU = gpuarray.dot(a_gpu, b_gpu)
print("====")
print(axbGPU)
toc = time.time() - tic
print("%s s for GPU" % toc)
def dot(a, b):
    """Return ``gpuarray.dot(a, b)`` copied to the host."""
    device_result = gpuarray.dot(a, b)
    return device_result.get()
def norm(self):
    """The L2-norm on the flattened vector.

    The squared norm is computed on the device from ``self.array`` and
    copied to the host before the square root is taken.
    """
    squared = gpuarray.dot(self.array, self.array).get()
    return np.sqrt(squared)
if (self.options.use_tolg): nr = cua.max(cua.fabs(self.grad)).get(); if (nr < self.options.tolg): self.term_reason = '|| grad ||_inf < opt.tolg' return # No condition met, so return false self.term_reason = 0; if __name__ == '__main__': case = 2 if case == 1: A = curand.rand((10000,1000)) xt = curand.rand((1000,1)) b = cua.dot(A, xt) x_init = cua.empty_like(xt) x_init.fill(0.1) # Set up objective objective = MVM_Objective(A,b) # Default optimization options opt = Solopt() pbb = PBB(objective, x_init, opt); elif case == 2: x = pylab.imread('lena.png')
def __init__(self, objective, x_init, options):
    """Run the Projected Barzilai-Borwein iteration to completion.

    The constructor performs the whole optimisation: it initialises the
    state from ``x_init``, iterates at most ``options.maxiter`` times,
    stopping early once ``check_termination`` sets a reason, and stores
    the final iterate in ``self.result``.

    Raises:
        IOError: if the initial vector has norm below 1e-12.
    """
    self.objective = objective
    self.options = options
    self.time_start = time.clock()
    self.iter = 0
    self.status = 'Failure'

    # ---- Initialisation ----
    self.initialisation(x_init)

    # ---- Sanity checks ----
    initial_norm = np.sqrt(cua.dot(self.x, self.x).get())
    if initial_norm < 1e-12:
        raise IOError('Initial vector close to zero. Cannot proceed')

    # ---- Prime the pump ----
    if options.verbose:
        print('Running Projected Barzilai Borwein:\n')

    # ---- Main iterative loop ----
    for _ in range(options.maxiter):
        self.iter += 1
        self.show_status()

        change_x = self.x - self.oldx
        change_g = self.g - self.oldg
        if not options.unconstrained:
            for change in (change_x, change_g):
                clip2bound(change, self.x, self.g)
        self.dx, self.dg = change_x, change_g

        # Check termination criteria
        self.check_termination()
        if self.term_reason:
            break

        # store x & gradient
        self.oldx, self.oldg = self.x, self.g

        # update x & gradient; the two step-length formulas alternate
        # between even and odd iterations
        if self.iter % 2 == 0:
            numerator = cua.sum(change_x*change_x)
            denominator = cua.sum(change_x*change_g)
        else:
            numerator = cua.sum(change_x*change_g)
            denominator = cua.sum(change_g*change_g)
        step = (numerator / (0.00001+denominator)).get()

        self.x = self.x - self.g * step
        if not options.unconstrained:
            gputools.cliplower_GPU(self.x, 0)  # projection

        if options.compute_both:
            self.oldobj = self.obj
            self.obj, self.g = objective.compute_both(self.x)
        else:
            self.g = objective.compute_grad(self.x)
            if options.compute_obj:
                self.oldobj = self.obj
                self.obj = objective.compute_obj(self.x)

    # ---- Final statistics and wrap up ----
    self.time = time.clock() - self.time_start
    self.status = 'Success'

    if self.options.verbose:
        for message in (self.status, self.term_reason, 'Done\n'):
            print(message)
    self.result = self.x
def compute_obj(self, x):
    """Return 0.5 * <r, r> with r = cua.dot(self.A, x) - self.b.

    The outer ``cua.dot`` used to be called with the residual as its only
    argument, which raises TypeError because ``dot`` needs two operands.
    The residual is now paired with itself.

    Args:
        x: current iterate (presumably a GPU array -- verify against
            the caller).

    Returns:
        Half the squared norm of the residual, as returned by ``cua.dot``
        (not copied to the host here).
    """
    residual = cua.dot(self.A, x) - self.b
    return 0.5 * cua.dot(residual, residual)
def compute_grad(self, x):
    """Return cua.dot(A^T, cua.dot(A, x) - b) for the stored A and b.

    NOTE(review): if ``cua`` is pycuda.gpuarray, ``dot`` is an inner
    product of the flattened operands, not a matrix product -- confirm
    this computes the intended A^T (A x - b).
    """
    residual = cua.dot(self.A, x) - self.b
    return cua.dot(self.A.T, residual)
def conjugate_gradient(self, init_delta, grad, iters=250, printing=False):
    """Find minimum of quadratic approximation using conjugate gradient
    algorithm.

    Runs on the GPU or on the host depending on ``self.net.use_GPU``.
    On the host path ``init_delta`` is updated in place.

    Args:
        init_delta: starting point for the weight update.
        grad: gradient; its negative is used as the right-hand side.
        iters: maximum number of CG iterations.
        printing: print per-iteration diagnostics when true.

    Returns:
        List of ``(iteration, delta)`` snapshots kept for backtracking;
        the last entry is the final delta.
    """
    net = self.net
    if net.debug:
        net.check_grad(grad)

    store_iter = 5
    store_mult = 1.3
    deltas = []
    grad = -grad  # note negative, some CG algorithms are flipped
    vals = np.zeros(iters, dtype=net.dtype)

    if net.use_GPU:
        from pycuda import gpuarray

        base_grad = gpuarray.to_gpu(grad)
        delta = gpuarray.to_gpu(init_delta)
        G_dir = gpuarray.zeros(grad.shape, dtype=net.dtype)

        def dot(a, b):
            return gpuarray.dot(a, b).get()

        def get(x):
            return x.get(pagelocked=True)

        self.calc_G = net.GPU_calc_G
    else:
        base_grad = grad
        delta = init_delta
        G_dir = np.zeros_like(grad)
        dot = np.dot

        def get(x):
            return x.copy()

        self.calc_G = net.calc_G

    residual = base_grad.copy()
    residual -= self.calc_G(delta, damping=self.damping, out=G_dir)
    res_norm = dot(residual, residual)
    direction = residual.copy()

    for i in range(iters):
        if printing:
            print("-" * 20)
            print("CG iteration %s" % (i,))
            print("delta norm %s" % (np.linalg.norm(get(delta)),))
            print("direction norm %s" % (np.linalg.norm(get(direction)),))

        # fills G_dir through the out argument
        self.calc_G(direction, damping=self.damping, out=G_dir)

        # calculate step size
        step = res_norm / dot(direction, G_dir)
        if not np.isfinite(step):
            warnings.warn("Non-finite step value (%f)" % step)
            step = np.nan_to_num(step)

        if printing:
            print("G_dir norm %s" % (np.linalg.norm(get(G_dir)),))
            print("step %s" % (step,))

        if net.debug:
            tmp_G_dir = get(G_dir)
            tmp_dir = get(direction)
            net.check_G(tmp_G_dir, tmp_dir, self.damping)

            assert np.isfinite(step)
            assert step >= 0
            damped = np.linalg.norm(np.dot(tmp_dir, tmp_G_dir))
            undamped = np.linalg.norm(
                np.dot(tmp_dir, net.calc_G(tmp_dir, damping=0)))
            assert damped >= undamped

        # update weight delta
        delta += step * direction

        # update residual
        residual -= step * G_dir
        new_res_norm = dot(residual, residual)

        if new_res_norm < 1e-20:
            # early termination (mainly to prevent numerical errors);
            # the main termination condition is below.
            break

        # update direction
        beta = new_res_norm / res_norm
        direction *= beta
        direction += residual
        res_norm = new_res_norm

        # store deltas for backtracking
        if i == store_iter:
            deltas.append((i, get(delta)))
            store_iter = int(store_iter * store_mult)

        # martens termination conditions
        vals[i] = -0.5 * dot(residual + base_grad, delta)
        gap = max(int(0.1 * i), 10)

        if printing:
            print("termination val %s" % (vals[i],))

        if (i > gap and vals[i - gap] < 0 and
                (vals[i] - vals[i - gap]) / vals[i] < 5e-6 * gap):
            break

    deltas.append((i, get(delta)))
    return deltas
def compute_obj(self, x):
    """Return 0.5 * <r, r> with r = cua.dot(self.A, x) - self.b.

    The outer ``cua.dot`` used to be called with the residual as its only
    argument, which raises TypeError because ``dot`` needs two operands.
    The residual is now paired with itself.

    Args:
        x: current iterate (presumably a GPU array -- verify against
            the caller).

    Returns:
        Half the squared norm of the residual, as returned by ``cua.dot``
        (not copied to the host here).
    """
    residual = cua.dot(self.A, x) - self.b
    return 0.5 * cua.dot(residual, residual)
def minimize_batch(self, batch_size, eta, opt_method):
    """Run one pass of mini-batch gradient descent over ``self.nonzeros``.

    The entries of ``self.nonzeros`` are processed in consecutive chunks of
    ``batch_size``; each entry supplies a row index (``[0]``) and a column
    index (``[1]``).  For every chunk the GPU kernels compute the inner
    cost, its weighted version, the gradients, and then update ``self.W``,
    ``self.W_tilde``, ``self.b`` and ``self.b_tilde`` in place.

    :param batch_size: number of entries per batch (the last batch may be
        shorter).
    :param eta: step size handed to the in-place subtraction kernels.
    :param opt_method: currently unused by this method.
    :returns: sum over all batches of
        ``dot(weighted_cost_inner, cost_inner)``.
    """
    # using a batch_gradient descent method
    cost = 0.0
    for lower_index in range(0, len(self.nonzeros), batch_size):
        # index of first element after the end of this batch
        upper_index = min(lower_index + batch_size, len(self.nonzeros))
        batch = [self.nonzeros[k] for k in range(lower_index, upper_index)]
        batch_i = [index[0] for index in batch]
        batch_j = [index[1] for index in batch]
        cur_batch_len = np.int32(upper_index - lower_index)
        batch_i_gpu = gpuarray.to_gpu(np.array(batch_i, dtype=np.int32))
        batch_j_gpu = gpuarray.to_gpu(np.array(batch_j, dtype=np.int32))
        # buffers are always batch_size long; a short final batch leaves
        # the tail at zero, which does not affect the dot product below
        cost_inner = gpuarray.zeros(batch_size, dtype=np.float32)
        weighted_cost_inner = gpuarray.zeros_like(cost_inner)

        # calculate intermediate values
        # cost_inner = <W[batch_i], W_tilde[batch_j]> + self.b[batch_i] +
        #     self.b_tilde[batch_j] - np.log(cooccurrence of the batch)
        # (presumably split across the next two kernels -- verify against
        # the kernel sources)
        batchMatColDot(cur_batch_len, self.v_dim, self.W, self.W_tilde,
                       batch_i_gpu, batch_j_gpu, cost_inner,
                       block=(self.blockDim_x, self.blockDim_y, 1),
                       grid=(self.numBlocks_x, self.numBlocks_y))
        context.synchronize()
        # debug output for the first batch only
        if lower_index == 0:
            print(cost_inner.get())
        batchCostInner(np.int32(lower_index), np.int32(upper_index),
                       cost_inner, self.b, self.b_tilde,
                       self.cooccurrence_mat, batch_i_gpu, batch_j_gpu,
                       block=(self.blockDim, 1, 1),
                       grid=(self.numBlocks, 1))
        if lower_index == 0:
            print(cost_inner.get())
        context.synchronize()

        # weighted_cost_inner = f_x[lower_index:upper_index] * cost_inner
        batchWeightedInnerCost(np.int32(lower_index), np.int32(upper_index),
                               self.f_x, cost_inner, weighted_cost_inner,
                               block=(self.blockDim, 1, 1),
                               grid=(self.numBlocks, 1))
        if lower_index == 0:
            print(weighted_cost_inner.get())
        context.synchronize()

        # calculate the gradients of each parameter
        # self.gradW[batch_i] = (self.W_tilde[batch_j].T * weighted_cost_inner).T
        batchMatVecRowMult(cur_batch_len, self.v_dim, self.W_tilde,
                           weighted_cost_inner, self.gradW,
                           batch_j_gpu, batch_i_gpu,
                           block=(self.blockDim_x, self.blockDim_y, 1),
                           grid=(self.numBlocks_x, self.numBlocks_y))
        # self.gradW_tilde[batch_j] = (self.W[batch_i].T * weighted_cost_inner).T
        batchMatVecRowMult(cur_batch_len, self.v_dim, self.W,
                           weighted_cost_inner, self.gradW_tilde,
                           batch_i_gpu, batch_j_gpu,
                           block=(self.blockDim_x, self.blockDim_y, 1),
                           grid=(self.numBlocks_x, self.numBlocks_y))
        # self.gradb[batch_i] = self.gradb_tilde[batch_j] = weighted_cost_inner
        # The destination must be the gradient buffers: writing into
        # self.b / self.b_tilde would overwrite the parameters themselves
        # and leave the gradients used by the update step below stale.
        batchCopyVector(cur_batch_len, weighted_cost_inner, self.gradb,
                        batch_i_gpu,
                        block=(self.blockDim, 1, 1),
                        grid=(self.numBlocks, 1))
        batchCopyVector(cur_batch_len, weighted_cost_inner,
                        self.gradb_tilde, batch_j_gpu,
                        block=(self.blockDim, 1, 1),
                        grid=(self.numBlocks, 1))
        context.synchronize()

        # perform the main parameter updates
        # self.W[batch_i] -= eta * self.gradW[batch_i]
        batchMatSubtractInplace(cur_batch_len, self.v_dim, eta, self.W,
                                self.gradW, batch_i_gpu,
                                block=(self.blockDim_x, self.blockDim_y, 1),
                                grid=(self.numBlocks_x, self.numBlocks_y))
        # self.W_tilde[batch_j] -= eta * self.gradW_tilde[batch_j]
        batchMatSubtractInplace(cur_batch_len, self.v_dim, eta,
                                self.W_tilde, self.gradW_tilde, batch_j_gpu,
                                block=(self.blockDim_x, self.blockDim_y, 1),
                                grid=(self.numBlocks_x, self.numBlocks_y))
        # self.b[batch_i] -= eta * self.gradb[batch_i]
        batchVecSubtractInplace(cur_batch_len, eta, self.b, self.gradb,
                                batch_i_gpu,
                                block=(self.blockDim, 1, 1),
                                grid=(self.numBlocks, 1))
        # self.b_tilde[batch_j] -= eta * self.gradb_tilde[batch_j]
        batchVecSubtractInplace(cur_batch_len, eta, self.b_tilde,
                                self.gradb_tilde, batch_j_gpu,
                                block=(self.blockDim, 1, 1),
                                grid=(self.numBlocks, 1))
        context.synchronize()

        # accumulate this batch's contribution to the total cost
        cost += gpuarray.dot(weighted_cost_inner, cost_inner).get()
    return cost
def get_purity(self):
    """
    Compute the purity of the Wigner function currently stored on the GPU,
    i.e. 2*np.pi*np.sum(W**2)*dXdP.
    :return: float
    """
    wigner = self.wignerfunction
    # inner product of W with itself, brought back to the host; only the
    # real part is used
    self_overlap = gpuarray.dot(wigner, wigner).get().real
    return 2. * np.pi * self_overlap * self.dXdP
import pycuda.gpuarray as gpuarray
import pycuda.driver as cuda
import pycuda.autoinit
import numpy
import time

# Times gpuarray.dot on pairs of random n x n float32 arrays.
# NOTE(review): per the PyCUDA documentation gpuarray.dot computes the dot
# product of the two arrays (a scalar), not a matrix product -- confirm this
# is the operation the benchmark is meant to measure.
r = [2, 5000]
for n in r:
    a = numpy.float32(numpy.random.randn(n, n))
    b = numpy.float32(numpy.random.randn(n, n))
    a_gpu = gpuarray.to_gpu(a)
    b_gpu = gpuarray.to_gpu(b)

    tic = time.time()
    axbGPU = gpuarray.dot(a_gpu, b_gpu)
    # The call above only queues the GPU work; wait for it to finish so the
    # measured interval covers the computation and not just the launch.
    cuda.Context.synchronize()
    toc = time.time() - tic
    print(toc)
import sys
import time

import pycuda.driver as cuda
import pycuda.gpuarray as gpuArray
import pycuda.autoinit
import numpy

# Command-line benchmark: builds two float32 vectors of length
# work_size * n_workers, copies them to the GPU and times their dot product.
if len(sys.argv) != 4:
    print("Usage: python3 dot_cuda.py <n_workers> <work_size> <repetitions>")
    sys.exit(1)

n_workers = int(sys.argv[1])
work_size = int(sys.argv[2])
# Parsed (and validated as an integer) to keep the command line unchanged;
# the value is not used below.
repetitions = int(sys.argv[3])

# Allocation phase: build the host vectors and transfer them to the device.
t1 = time.perf_counter()
total_size = work_size * n_workers
vec_a = numpy.full(total_size, 0.01, dtype=numpy.float32)
vec_b = numpy.full(total_size, 1.00, dtype=numpy.float32)
gpu_a = gpuArray.to_gpu(vec_a)
gpu_b = gpuArray.to_gpu(vec_b)
t_aloc = time.perf_counter() - t1

# Computation phase: wait for the GPU to finish before stopping the clock,
# and compute the elapsed time as (end - start).
t2 = time.perf_counter()
dot = gpuArray.dot(gpu_a, gpu_b)
cuda.Context.synchronize()
t_calc = time.perf_counter() - t2

print(dot)
print("Tempo Alocacao: " + str(t_aloc))
print("Tempo Calculos: " + str(t_calc))
if (self.options.use_tolg): nr = cua.max(cua.fabs(self.grad)).get() if (nr < self.options.tolg): self.term_reason = '|| grad ||_inf < opt.tolg' return # No condition met, so return false self.term_reason = 0 if __name__ == '__main__': case = 2 if case == 1: A = curand.rand((10000, 1000)) xt = curand.rand((1000, 1)) b = cua.dot(A, xt) x_init = cua.empty_like(xt) x_init.fill(0.1) # Set up objective objective = MVM_Objective(A, b) # Default optimization options opt = Solopt() pbb = PBB(objective, x_init, opt) elif case == 2: x = pylab.imread('lena.png')