def add_ancilla(self, anc_st):
    """Add an ancilla in the ground or excited state as the highest new bit."""
    byte_size_of_smaller_dm = 2**(2 * self.no_qubits) * 8
    if self.allocated_qubits == self.no_qubits:
        # allocate larger memory
        new_dm = ga.zeros(self._size * 4, np.float64)
        offset = anc_st * 3 * byte_size_of_smaller_dm
        drv.memcpy_dtod(int(new_dm.gpudata) + offset,
                        self.data.gpudata,
                        byte_size_of_smaller_dm)
        self.data = new_dm
    else:
        # reuse previously allocated memory
        if anc_st == 0:
            drv.memset_d8(int(self.data.gpudata) + byte_size_of_smaller_dm,
                          0, 3 * byte_size_of_smaller_dm)
        if anc_st == 1:
            drv.memcpy_dtod(int(self.data.gpudata) + 3 * byte_size_of_smaller_dm,
                            self.data.gpudata,
                            byte_size_of_smaller_dm)
            drv.memset_d8(self.data.gpudata, 0, 3 * byte_size_of_smaller_dm)
    self._set_no_qubits(self.no_qubits + 1)
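# --- Added sketch (not from the snippet above; generic names, assuming only a
# working PyCUDA install): the two driver calls it relies on. Offsets into a
# device allocation are plain byte arithmetic on int(gpudata), and both
# memcpy_dtod and memset_d8 take their sizes in bytes.
import numpy as np
import pycuda.autoinit  # noqa: F401 (creates a context on the default device)
import pycuda.driver as drv
import pycuda.gpuarray as ga

block = ga.to_gpu(np.full(4, 7.0, np.float64))
bigger = ga.zeros(16, np.float64)
nbytes = block.nbytes
# copy `block` into the second quarter of `bigger`
drv.memcpy_dtod(int(bigger.gpudata) + nbytes, block.gpudata, nbytes)
# zero that quarter again, byte by byte (all-zero bytes give 0.0 doubles)
drv.memset_d8(int(bigger.gpudata) + nbytes, 0, nbytes)
assert np.all(bigger.get() == 0.0)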
def stepFunction():
    global animIter
    if showActivity:
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10), concentrationIn_d, activeBlocks_d,
                           grid=grid2D, block=block2D)
        getActivityKernel(activeBlocks_d, activeThreads_d,
                          grid=grid2D, block=block2D)
    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d, np.uint8(showActivity),
                  activeThreads_d)
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    else:
        [oneIteration_sh() for i in range(nIterationsPerPlot // 2)]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
def _read_external_input(self):
    # if eof not reached or there are frames in buffer not read,
    # copy the input from buffer to synapse state array
    if not self.input_eof or self.frame_count < self.frames_in_buffer:
        cuda.memcpy_dtod(
            int(int(self.synapse_state.gpudata) +
                self.total_synapses * self.synapse_state.dtype.itemsize),
            int(int(self.I_ext.gpudata) +
                self.frame_count * self.I_ext.ld * self.I_ext.dtype.itemsize),
            self.num_input * self.synapse_state.dtype.itemsize,
        )
        self.frame_count += 1
    else:
        self.log_info("Input end of file reached. "
                      "Subsequent behaviour is undefined.")
    # if all buffer frames were read, read from file
    if self.frame_count >= self._one_time_import and not self.input_eof:
        input_ld = self.input_h5file.root.array.shape[0]
        if input_ld - self.file_pointer < self._one_time_import:
            h_ext = self.input_h5file.root.array.read(self.file_pointer,
                                                      input_ld)
        else:
            h_ext = self.input_h5file.root.array.read(
                self.file_pointer, self.file_pointer + self._one_time_import)
        if h_ext.shape[0] == self.I_ext.shape[0]:
            self.I_ext.set(h_ext)
            self.file_pointer += self._one_time_import
            self.frame_count = 0
        else:
            pad_shape = list(h_ext.shape)
            self.frames_in_buffer = h_ext.shape[0]
            pad_shape[0] = self._one_time_import - h_ext.shape[0]
            h_ext = np.concatenate((h_ext, np.zeros(pad_shape)), axis=0)
            self.I_ext.set(h_ext)
            self.file_pointer = input_ld
        if self.file_pointer == self.input_h5file.root.array.shape[0]:
            self.input_eof = True
def cache_z(self, z):
    x = np.require(z.real, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    y = np.require(z.imag, dtype=np.double, requirements=['A', 'W', 'O', 'C'])
    xd = gpuarray.to_gpu(x)
    yd = gpuarray.to_gpu(y)
    cuda.memcpy_dtod(self.xd, xd.ptr, xd.nbytes)
    cuda.memcpy_dtod(self.yd, yd.ptr, yd.nbytes)
def copy_rows(self, start, stop, step=1):
    nrows = len(range(start, stop, step))
    if nrows:
        if self.ndim == 2:
            shape = (nrows, self.shape[1])
        else:
            shape = (nrows, self.shape[1], self.shape[2])
    else:
        if self.ndim == 2:
            shape = (nrows, 0)
        else:
            shape = (nrows, 0, 0)
    result = PitchArray(shape, self.dtype)
    if nrows > 1:
        PitchTrans(
            shape, result.gpudata, result.ld,
            int(self.gpudata) + start * self.ld * self.dtype.itemsize,
            self.ld * step, self.dtype)
    elif nrows == 1:
        cuda.memcpy_dtod(
            result.gpudata,
            int(self.gpudata) + start * self.ld * self.dtype.itemsize,
            self.dtype.itemsize * _pd(shape))
    return result
def swapHashTableValues(new_vals):
    # (device_ptr, size_in_bytes)
    table_vals, table_vals_size = mod.get_global('table_values')
    old_vals_gpu = cuda.mem_alloc(table_vals_size)
    # old_vals_gpu = gpuarray.empty((table_vals_size, 1), )
    cuda.memcpy_dtod(old_vals_gpu, table_vals, table_vals_size)
    cuda.memcpy_dtod(table_vals, new_vals.gpudata, table_vals_size)
    return old_vals_gpu
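# --- Added sketch (assumes a CUDA toolkit with nvcc on the path; the one-line
# module source is illustrative, not the original hash-table code): how
# get_global exposes a __device__ variable as a (pointer, size_in_bytes) pair
# that the memcpy functions above operate on.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
from pycuda.compiler import SourceModule

mod = SourceModule("__device__ float table_values[4];")
table_vals, table_vals_size = mod.get_global('table_values')
new_vals = np.arange(4, dtype=np.float32)
cuda.memcpy_htod(table_vals, new_vals)  # write the device global
check = np.empty(4, np.float32)
cuda.memcpy_dtoh(check, table_vals)     # read it back
assert np.array_equal(check, new_vals)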
def copy(self):
    if not self.flags.forc:
        raise RuntimeError("only contiguous arrays may be copied.")
    new = GPUArray(self.shape, self.dtype)
    drv.memcpy_dtod(new.gpudata, self.gpudata, self.nbytes)
    return new
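# --- Added sketch (generic names, assuming only a working PyCUDA install):
# the device-to-device copy that copy() performs. memcpy_dtod moves raw bytes
# between two device allocations, which is why the method insists on
# contiguous arrays.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

src = gpuarray.to_gpu(np.arange(16, dtype=np.float32))
dst = gpuarray.empty_like(src)
drv.memcpy_dtod(dst.gpudata, src.gpudata, src.nbytes)  # size is in bytes
assert np.array_equal(dst.get(), src.get())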
def execute(self):
    ready = self.comm.recv(source=self.op.source_id, tag=TAG_SCATTER)
    if ready:
        drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
    else:
        raise RuntimeError("Synchronization failed!")
def _gpuarray_copy(array):
    if not array.flags.forc:
        raise RuntimeError('only contiguous arrays may be copied.')
    new = GPUArray(array.shape, array.dtype, allocator=array.allocator)
    drv.memcpy_dtod(new.gpudata, array.gpudata, array.nbytes)
    return new
def execute(self):
    # Push our fragment into its section of the larger recvr buffer, which
    # assumes the gather axis is least contiguous.
    if self.comm.Get_rank() > 0:
        drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                        self.tensor.tensor.size * self.op.dtype.itemsize)
    self.comm.barrier()
def execute(self):
    sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
    while sender_ready == 0:
        sender_ready = drv.from_device(self.sender_ready, (1, ), np.int8)
    drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                    self.tensor.tensor.size * self.op.dtype.itemsize)
    drv.memset_d8(self.sender_ready, 0, 1)
def leapfrogStationary(d_x, d_t, v, xmin, xmax, alpha):
    # --- Allocate device memory space for solution
    d_u = cuda.mem_alloc((N + 1) * (M + 1) * 4)
    d_u1 = cuda.mem_alloc((N + 1) * 4)
    d_u2 = cuda.mem_alloc((N + 1) * 4)
    d_u3 = cuda.mem_alloc((N + 1) * 4)

    # --- Set memory to zero
    cuda.memset_d32(d_u, 0x00, (N + 1) * (M + 1))
    cuda.memset_d32(d_u1, 0x00, (N + 1))
    cuda.memset_d32(d_u2, 0x00, (N + 1))
    cuda.memset_d32(d_u3, 0x00, (N + 1))
    # u = np.zeros(((M + 1), N + 1))

    blockDim = (BLOCKSIZE, 1, 1)
    gridDim = (int(iDivUp(N + 1, BLOCKSIZE)), 1, 1)

    # --- Step0
    setStep0(d_u1, d_u, d_t, d_x, np.float32(v), np.float32(xmin),
             np.float32(xmax), np.int32(N), block=blockDim, grid=gridDim)

    # --- Step1
    setStep1(d_u1, d_u2, d_u, d_t, d_x, np.float32(v), np.float32(xmin),
             np.float32(xmax), np.float32(alpha), np.float32(dt), np.int32(N),
             block=blockDim, grid=gridDim)

    for l in range(1, M - 1):
        updateShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v),
                     np.float32(xmin), np.float32(xmax), np.float32(alpha),
                     np.int32(l), np.int32(N), block=blockDim, grid=gridDim)
        # updateNoShared(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block=blockDim, grid=gridDim)
        # updateNoSharedNotWorking(d_u1, d_u2, d_u3, d_u, d_t, d_x, np.float32(v), np.float32(xmin), np.float32(xmax), np.float32(alpha), np.int32(l), np.int32(N), block=blockDim, grid=gridDim)
        cuda.memcpy_dtod(d_u1, d_u2, (N + 1) * 4)
        cuda.memcpy_dtod(d_u2, d_u3, (N + 1) * 4)

    return d_u
def _assign(self, value):
    if isinstance(value, (int, float)):
        # if we have a c or f contiguous array, then use the speedy driver kernel
        if self.flags.forc and float(value) >= 0:
            drv.memset_d16(self.gpudata,
                           Flexpt.flex_from_native(value, self.iwl),
                           self.size)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("copy", value, None, out=self)
    elif isinstance(value, FlexptArray):
        if self.flags.forc and value.flags.forc and self.iwl == value.iwl:
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("copy", value, None, out=self)
    elif isinstance(value, OpTreeNode):
        value.execute(out=self)
    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))
    return self
def run_step(self, update_pointers, st=None):
    self.sum_in_variable('individual_input', self.inputs['individual_input'],
                         st=st)
    cuda.memcpy_dtod(int(update_pointers['I']),
                     self.inputs['individual_input'].gpudata,
                     self.inputs['individual_input'].nbytes)
def _copy_memory_dtod(self, index):
    # downsample index
    d_index = int(index / self.rate)
    # buffer index
    b_index = d_index % self.buffer_length

    for key in self.spike_vars:
        src = getattr(self.obj, key)
        self.gpu_dct[key][b_index] += src

    if index % self.rate != 0:
        return

    for key in self.dct.keys():
        src = getattr(self.obj, key)
        if key in self.spike_vars:
            continue
        else:
            dst = int(self.gpu_dct[key].gpudata) + b_index * src.nbytes
            cuda.memcpy_dtod(dst, src.gpudata, src.nbytes)

    # dump data to CPU if simulation complete or buffer full
    if (d_index == self.steps - 1) or (b_index == self.buffer_length - 1):
        for key in self.dct.keys():
            buffer = self.get_buffer(key, d_index)
            cuda.memcpy_dtoh(buffer, self.gpu_dct[key].gpudata)
        for key in self.spike_vars:
            self.gpu_dct[key].fill(0.0)
def _read_external_input(self):
    if not self.input_eof or self.frame_count < self.frames_in_buffer:
        cuda.memcpy_dtod(
            int(int(self.synapse_state.gpudata) +
                self.total_synapses * self.synapse_state.dtype.itemsize),
            int(int(self.I_ext.gpudata) +
                self.frame_count * self.I_ext.ld * self.I_ext.dtype.itemsize),
            self.num_input * self.synapse_state.dtype.itemsize)
        self.frame_count += 1
    else:
        self.logger.info('Input end of file reached. Subsequent behaviour is undefined.')
    if self.frame_count >= self._one_time_import and not self.input_eof:
        input_ld = self.input_h5file.root.array.shape[0]
        if input_ld - self.file_pointer < self._one_time_import:
            h_ext = self.input_h5file.root.array.read(self.file_pointer,
                                                      input_ld)
        else:
            h_ext = self.input_h5file.root.array.read(
                self.file_pointer, self.file_pointer + self._one_time_import)
        if h_ext.shape[0] == self.I_ext.shape[0]:
            self.I_ext.set(h_ext)
            self.file_pointer += self._one_time_import
            self.frame_count = 0
        else:
            pad_shape = list(h_ext.shape)
            self.frames_in_buffer = h_ext.shape[0]
            pad_shape[0] = self._one_time_import - h_ext.shape[0]
            h_ext = np.concatenate((h_ext, np.zeros(pad_shape)), axis=0)
            self.I_ext.set(h_ext)
            self.file_pointer = input_ld
        if self.file_pointer == self.input_h5file.root.array.shape[0]:
            self.input_eof = True
def set(self, queue_adapter, buf, no_async=False):
    device_idx = queue_adapter._device_idx
    assert device_idx == self._device_idx
    self._context_adapter.activate_device(device_idx)

    # PyCUDA needs pointers to be passed as `numpy.number` to kernels,
    # but `memcpy` functions require Python `int`s.
    ptr = int(self._ptr) if isinstance(self._ptr, numpy.number) else self._ptr

    if isinstance(buf, numpy.ndarray):
        if no_async:
            pycuda_driver.memcpy_htod(ptr, buf)
        else:
            pycuda_driver.memcpy_htod_async(
                ptr, buf, stream=queue_adapter._pycuda_stream)
    else:
        buf_ptr = int(buf._ptr) if isinstance(buf._ptr, numpy.number) else buf._ptr
        if no_async:
            pycuda_driver.memcpy_dtod(ptr, buf_ptr, buf.size)
        else:
            pycuda_driver.memcpy_dtod_async(
                ptr, buf_ptr, buf.size, stream=queue_adapter._pycuda_stream)
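# --- Added usage sketch for the async branch above (generic names, nothing
# from the surrounding adapter classes): memcpy_dtod_async takes the same
# (dst, src, nbytes) arguments plus a stream, and the copy is only ordered
# with respect to other work in that stream.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

stream = drv.Stream()
src = gpuarray.to_gpu(np.ones(1024, np.float32))
dst = gpuarray.empty_like(src)
drv.memcpy_dtod_async(dst.gpudata, src.gpudata, src.nbytes, stream=stream)
stream.synchronize()  # the copy is only guaranteed complete after this
assert np.all(dst.get() == 1.0)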
def matvec(self, v):
    x = v.reshape((self.D, self.D))

    self.xG.set(x)
    #self.out2.set(self.xG)
    #self.out2[:] = self.xG
    cd.memcpy_dtod(self.out2.gpudata, self.xG.gpudata, self.xG.nbytes)

    out = [self.out, self.out_p]
    out2 = [self.out2, self.out2_p]

    if self.left:
        # Multiplying from the left, but x is a col. vector, so use mat_dagger
        for k in range(len(self.A1G)):
            if self.use_batch:
                eps_l_noop_batch(out2[1], self.A1G_p[k], self.A2G_p[k], out[0],
                                 self.tmp_p, self.tmp2_p, self.tmp2, self.hdl)
            else:
                eps_l_noop_strm_dev(out2[0], self.A1G[k], self.A2G[k], out[0],
                                    self.tmp, self.tmp2, self.ones, self.zeros,
                                    self.streams, self.hdl)
            out, out2 = out2, out
        Ehx = out2[0]

        if self.pseudo:
            QEQhx = Ehx - self.lG * m.adot(self.r, x)
            #res = QEQhx.mul_add(-sp.exp(-1.j * self.p), self.xG, 1)
            cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(-1.j * self.p),
                           QEQhx.gpudata, 1, self.xG.gpudata, 1)
            res = self.xG
        else:
            #res = Ehx.mul_add(-sp.exp(-1.j * self.p), self.xG, 1)
            cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(-1.j * self.p),
                           Ehx.gpudata, 1, self.xG.gpudata, 1)
            res = self.xG
    else:
        for k in range(len(self.A2G) - 1, -1, -1):
            if self.use_batch:
                eps_r_noop_batch(out2[1], self.A1G_p[k], self.A2G_p[k], out[0],
                                 self.tmp_p, self.tmp2_p, self.tmp2, self.hdl)
            else:
                eps_r_noop_strm_dev(out2[0], self.A1G[k], self.A2G[k], out[0],
                                    self.tmp, self.tmp2, self.ones, self.zeros,
                                    self.streams, self.hdl)
            out, out2 = out2, out
        Ex = out2[0]

        if self.pseudo:
            QEQx = Ex - self.rG * m.adot(self.l, x)
            #res = QEQx.mul_add(-sp.exp(1.j * self.p), self.xG, 1)
            cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(1.j * self.p),
                           QEQx.gpudata, 1, self.xG.gpudata, 1)
            res = self.xG
        else:
            #res = Ex.mul_add(-sp.exp(1.j * self.p), self.xG, 1)
            cb.cublasZaxpy(self.hdl, self.D**2, -sp.exp(1.j * self.p),
                           Ex.gpudata, 1, self.xG.gpudata, 1)
            res = self.xG

    return res.get().ravel()
def set_data(filenames, file_count, subb, config, count, cur, img_mean,
             gpu_data, gpu_data_remote, ctx, icomm, img_batch_empty):
    load_time = time.time()
    data = None
    # aa = config['rank'] + count / subb * size
    # img_list = range(aa * config['file_batch_size'], (aa + 1) * config['file_batch_size'], 1)
    # print rank, img_list
    if config['data_source'] in ['hkl', 'both']:
        data_hkl = hkl.load(str(filenames[file_count]))  # c01b
        data = data_hkl
    if config['data_source'] in ['lmdb', 'both']:
        data_lmdb = lmdb_load_cur(cur, config, img_batch_empty)
        data = data_lmdb
    if config['data_source'] == 'both':
        if config['rank'] == 0:
            print (rank, (data_hkl - data_lmdb)[1, 0:3, 1, 1].tolist())
    load_time = time.time() - load_time

    sub_time = time.time()
    data = data - img_mean
    sub_time = time.time() - sub_time

    crop_time = time.time()
    for minibatch_index in range(subb):
        count += 1
        batch_data = data[:, :, :, minibatch_index * config['batch_size']:
                          (minibatch_index + 1) * config['batch_size']]
        if mode == 'train':
            rand_arr = get_rand3d(config['random'],
                                  count + (rank + 1) * n_files * subb)
        else:
            rand_arr = np.float32([0.5, 0.5, 0])
        batch_data = crop_and_mirror(batch_data, rand_arr,
                                     flag_batch=config['batch_crop_mirror'],
                                     cropsize=config['input_width'])
        gpu_data[minibatch_index].set(batch_data)
    crop_time = time.time() - crop_time
    # print 'load_time: %f (load %f, sub %f, crop %f)' % (load_time + crop_time + sub_time, load_time, sub_time, crop_time)

    # wait for computation on last file to finish
    msg = icomm.recv(source=MPI.ANY_SOURCE, tag=35)
    assert msg == "calc_finished"

    for minibatch_index in range(subb):
        # copy from preload area
        drv.memcpy_dtod(gpu_data_remote[minibatch_index].ptr,
                        gpu_data[minibatch_index].ptr,
                        gpu_data[minibatch_index].dtype.itemsize *
                        gpu_data[minibatch_index].size)

    ctx.synchronize()
    icomm.isend("copy_finished", dest=0, tag=55)

    return count
def execute(self):
    """
    Receive tensor
    """
    ready = self.comm.recv(source=self.source, tag=TAG_DIRECT)
    if ready:
        drv.memcpy_dtod(self.tensor.tensor.gpudata, self.sender_buf,
                        self.tensor.tensor.size * self.buf_item_size)
def pre_run(self, update_pointers):
    if 'initV' in self.params_dict:
        cuda.memcpy_dtod(int(update_pointers['V']),
                         self.params_dict['initV'].gpudata,
                         self.params_dict['initV'].nbytes)
        cuda.memcpy_dtod(self.internal_states['internalV'].gpudata,
                         self.params_dict['initV'].gpudata,
                         self.params_dict['initV'].nbytes)
def execute(self):
    # gather send execution is done here
    send_op_tensor = self.tensor_view_from_td(
        self.send_op[0].args[0].tensor_description())
    drv.memcpy_dtod(
        self.tensor.tensor.gpudata,
        send_op_tensor.tensor.gpudata,
        send_op_tensor.tensor.size * self.send_op[0].dtype.itemsize)
    self.comm.barrier()
def copy(self):
    """
    returns a duplicated copy of self
    """
    result = self._new_like_me()
    if self.size:
        cuda.memcpy_dtod(result.gpudata, self.gpudata,
                         self.mem_size * self.dtype.itemsize)
    return result
def copy(self):
    tmp = np.ndarray((1, ))
    out = Array(tmp, None)
    out.nbytes = self.nbytes
    out.x = cuda.mem_alloc(self.nbytes)
    cuda.memcpy_dtod(out.x, self.x, out.nbytes)
    out.dtype = self.dtype
    out.shape = self.shape
    return out
def _loadInput(self, stim):
    logging.debug('loadInput')

    # shortcuts
    nrXY = self.nrX * self.nrY
    nrXYD = self.nrX * self.nrY * self.nrDirs

    # parse input
    assert type(stim).__module__ == "numpy", "stim must be numpy array"
    assert type(stim).__name__ == "ndarray", "stim must be numpy.ndarray"
    assert stim.size > 0, "stim cannot be []"
    stim = stim.astype(np.ubyte)
    rows, cols = stim.shape
    logging.debug("- stim shape={0}x{1}".format(rows, cols))

    # shift d_stimBuf in time by 1 frame, from frame i to frame i-1
    # write our own memcpy kernel... :-(
    gdim = (int(iDivUp(nrXY, 128)), 1)
    bdim = (128, 1, 1)
    for i in xrange(1, self.nrT):
        stimBufPt_dst = np.intp(self.d_stimBuf) + self.szXY * (i - 1)
        stimBufPt_src = np.intp(self.d_stimBuf) + self.szXY * i
        self.dev_memcpy_dtod(stimBufPt_dst, stimBufPt_src, np.int32(nrXY),
                             block=bdim, grid=gdim)

    # index into d_stimBuf array to place the new stim at the end
    # (newest frame at pos: nrT-1)
    d_stimBufPt = np.intp(self.d_stimBuf) + self.szXY * (self.nrT - 1)

    # \TODO implement RGB support
    self.dev_split_gray(d_stimBufPt, cuda.In(stim), np.int32(stim.size),
                        block=bdim, grid=gdim)

    # create working copy of d_stimBuf
    cuda.memcpy_dtod(self.d_scalingStimBuf, self.d_stimBuf,
                     self.szXY * self.nrT)

    # reset V1complex responses to 0
    # \FIXME not sure how to use memset...doesn't seem to give expected
    # result
    tmp = np.zeros(nrXYD).astype(np.float32)
    cuda.memcpy_htod(self.d_respV1c, tmp)

    # allocate d_resp, which will contain the response to all 28
    # (nrFilters) space-time orientations at 3 (nrScales) scales for
    # every pixel location (nrX*nrY)
    tmp = np.zeros(nrXY * self.nrFilters * self.nrScales).astype(np.float32)
    cuda.memcpy_htod(self.d_resp, tmp)
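# --- Added note on the FIXME above: zeroing float32 device memory with
# memset does work, because the all-zero bit pattern is 0.0f. A minimal
# sketch with generic names; note that the count argument of memset_d32 is
# in 32-bit words, not bytes, which is a common source of unexpected results.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda

n = 1024
d_buf = cuda.mem_alloc(n * 4)
cuda.memset_d32(d_buf, 0, n)  # n words of 4 bytes each
h = np.empty(n, np.float32)
cuda.memcpy_dtoh(h, d_buf)
assert np.all(h == 0.0)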
def add_initializer(self, var_a, var_b, update_pointers):
    if var_a in self.params_dict:
        if var_b in self.internal_states:
            cuda.memcpy_dtod(self.internal_states[var_b].gpudata,
                             self.params_dict[var_a].gpudata,
                             self.params_dict[var_a].nbytes)
        if var_b in update_pointers:
            cuda.memcpy_dtod(int(update_pointers[var_b]),
                             self.params_dict[var_a].gpudata,
                             self.params_dict[var_a].nbytes)
def arrayp2g(pary):
    """convert a PitchArray to a GPUArray"""
    from pycuda.gpuarray import GPUArray
    result = GPUArray(pary.shape, pary.dtype)
    if pary.size:
        if pary.M == 1:
            cuda.memcpy_dtod(result.gpudata, pary.gpudata,
                             pary.mem_size * pary.dtype.itemsize)
        else:
            PitchTrans(pary.shape, result.gpudata, _pd(result.shape),
                       pary.gpudata, pary.ld, pary.dtype)
    return result
def _update_buffer(self):
    if self.my_num_gpot_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.gpot_buffer.gpudata) +
            self.buffer.gpot_current * self.buffer.gpot_buffer.ld *
            self.buffer.gpot_buffer.dtype.itemsize,
            self.V.gpudata, self.V.nbytes)
    if self.my_num_spike_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.spike_buffer.gpudata) +
            self.buffer.spike_current * self.buffer.spike_buffer.ld *
            self.buffer.spike_buffer.dtype.itemsize,
            self.spike_state.gpudata,
            int(self.spike_state.dtype.itemsize * self.my_num_spike_neurons))
def _set_state(self, k, v):
    cls = type(self)
    if k in self.params_dict:
        cuda.memcpy_dtod(self.states[k].gpudata,
                         self.params_dict[k].gpudata,
                         self.params_dict[k].nbytes)
    else:
        if isinstance(v, float):
            self.states[k].fill(self.floattype(v))
        else:
            assert v in cls.states
            self.states[k].fill(self.floattype(cls.states[v]))
def _update_buffer(self):
    if self.total_num_gpot_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.gpot_buffer.gpudata) +
            self.buffer.gpot_current * self.buffer.gpot_buffer.ld *
            self.buffer.gpot_buffer.dtype.itemsize,
            self.V.gpudata, self.V.nbytes)
    if self.total_num_spike_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.spike_buffer.gpudata) +
            self.buffer.spike_current * self.buffer.spike_buffer.ld *
            self.buffer.spike_buffer.dtype.itemsize,
            self.spike_state.gpudata,
            int(self.spike_state.dtype.itemsize * self.total_num_spike_neurons))
def gpuarray_copy(u):
    """Copies a gpuarray object.

    Args:
        u (gpuarray): Input array.

    Returns:
        gpuarray: Deep copy of input array.
    """
    v = gpuarray.zeros_like(u)
    v.strides = u.strides
    cuda.memcpy_dtod(v.gpudata, u.gpudata, u.nbytes)
    return v
def update(self):
    nn, ne, nne = np.int32([self.nn, self.ne, self.nne])
    dt, de, vf = np.float64([self.dt, self.de, self.vf])
    bs, gs = (256, 1, 1), (self.nn // 256 + 1, 1)
    ul, ul_prev, ul_tmp = self.ul_gpu, self.ul_prev_gpu, self.ul_tmp_gpu
    kl = self.kl_gpu
    el_sum = self.el_sum_gpu

    c_ul_tmps = np.float32([0, 0.5, 0.5, 1])
    c_uls = np.float32([1. / 6, 1. / 3, 1. / 3, 1. / 6])

    cuda.memcpy_dtod(ul_prev, ul, self.ul.nbytes)
    for c_ul_tmp, c_ul in zip(c_ul_tmps, c_uls):
        self.update_pre(nn, nne, vf, c_ul_tmp, ul, ul_prev, ul_tmp, kl,
                        el_sum, block=bs, grid=gs)
        self.update_ul(nn, ne, nne, dt, de, vf, c_ul, ul, ul_tmp, kl,
                       el_sum, block=bs, grid=gs)
def update_other_rest(self, gpot_data, my_num_gpot_neurons,
                      num_virtual_gpot_neurons):
    if self.num_gpot_neurons > 0:
        d_other_rest = garray.zeros(num_virtual_gpot_neurons, np.double)
        a = 0
        for data in gpot_data.itervalues():
            if len(data) > 0:
                cuda.memcpy_htod(int(d_other_rest.gpudata) + a, data)
                a += data.nbytes
        for i in range(self.gpot_delay_steps):
            cuda.memcpy_dtod(
                int(self.gpot_buffer.gpudata) +
                (self.gpot_buffer.ld * i + int(my_num_gpot_neurons)) *
                self.gpot_buffer.dtype.itemsize,
                d_other_rest.gpudata, d_other_rest.nbytes)
def get_cuda_tensor(self, sim, read_img=False):
    self.batch_renderer.render(sim)
    self.batch_renderer.map()
    torch_img = torch.cuda.ByteTensor(self.img_width, self.img_height, 3)
    torch_pointer = torch_img.data_ptr()
    render_pointer = self.batch_renderer._cuda_rgb_ptr
    cuda.memcpy_dtod(torch_pointer, render_pointer,
                     3 * self.img_width * self.img_height)
    true_img = None
    if read_img:
        true_img = self.batch_renderer.read()[0][0]
    self.batch_renderer.unmap()
    return torch_img, true_img
def slice_positions(self):
    '''Position of the respective slice start within the array
    self.particle_indices_by_slice.
    '''
    if not hasattr(self, '_slice_positions'):
        # the last entry of slice_positions needs to be n_slices,
        # the other entries are the same as lower_bounds
        self._slice_positions = gpuarray.zeros(
            self.n_slices + 1, dtype=self.lower_bounds.dtype)
        self._slice_positions += self.n_slices
        cuda.memcpy_dtod(self._slice_positions.gpudata,
                         self.lower_bounds.gpudata,
                         self.lower_bounds.nbytes)
    return self._slice_positions
def stepFunction():
    global animIter
    cuda.memcpy_dtod(plotDataFloat_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = (gpuarray.max(plotDataFloat_d)).get()
    multiplyByScalarReal(cudaPre(0.5 / maxVal), plotDataFloat_d)
    floatToUchar(plotDataFloat_d, plotDataChars_d)
    copyToScreenArray()
    if cudaP == "float":
        [oneIteration_tex() for i in range(nIterationsPerPlot)]
    #else: [ oneIteration_sh() for i in range(nIterationsPerPlot//2) ]
    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
def execute(self):
    if self.recvr_buf is None:
        # set_ipc_handle must be called before open_ipc_handle in certain
        # cases to avoid a hang, hence calling set_ in bind_buffers and
        # open_ in execute.
        # See corresponding comment in ScatterRecv kernel for details.
        (self.tnsr_ipc_hdl, self.send_ready) = open_ipc_handle(
            self.op._shared_queues[self.op.idx])
        chunk_size = self.tensor.tensor.size * self.op.dtype.itemsize
        self.recvr_buf = int(self.tnsr_ipc_hdl) + self.op.idx * chunk_size

    # Push our fragment into its section of the larger recvr buffer, which
    # assumes the gather axis is least contiguous.
    drv.memcpy_dtod(self.recvr_buf, self.tensor.tensor.gpudata,
                    self.tensor.tensor.size * self.op.dtype.itemsize)
    drv.memset_d8(self.send_ready, 1, 1)
def _copy_memory_dtod(self, index):
    # downsample index
    d_index = int(index / self.rate)
    # buffer index
    b_index = d_index % self.buffer_length

    for key in self.dct.keys():
        src = getattr(self.obj, key)
        dst = int(self.gpu_dct[key].gpudata) + b_index * src.nbytes
        cuda.memcpy_dtod(dst, src.gpudata, src.nbytes)

    if (d_index == self.steps - 1) or (b_index == self.buffer_length - 1):
        for key in self.dct.keys():
            buffer = self.get_buffer(key, d_index)
            cuda.memcpy_dtoh(buffer, self.gpu_dct[key].gpudata)
def _get_external_input(self):
    # use of intermediate I_ext can possibly be avoided
    input_ext = self.input_generator.next_input()
    if type(input_ext) == np.ndarray:
        self.I_ext.set(input_ext)
        cuda.memcpy_dtod(
            int(int(self.synapse_state.gpudata) +
                self.total_synapses * self.synapse_state.dtype.itemsize),
            int(self.I_ext.gpudata),
            self.num_input * self.synapse_state.dtype.itemsize)
    else:
        cuda.memcpy_dtod(
            int(int(self.synapse_state.gpudata) +
                self.total_synapses * self.synapse_state.dtype.itemsize),
            int(input_ext.gpudata),
            self.num_input * self.synapse_state.dtype.itemsize)
def _update_buffer(self):
    """
    Update circular buffer of past neuron states.
    """
    if self.total_num_gpot_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.gpot_buffer.gpudata) +
            self.buffer.gpot_current * self.buffer.gpot_buffer.ld *
            self.buffer.gpot_buffer.dtype.itemsize,
            self.V.gpudata,
            self.V.dtype.itemsize * self.total_num_gpot_neurons)
    if self.total_num_spike_neurons > 0:
        cuda.memcpy_dtod(
            int(self.buffer.spike_buffer.gpudata) +
            self.buffer.spike_current * self.buffer.spike_buffer.ld *
            self.buffer.spike_buffer.dtype.itemsize,
            self.spike_state.gpudata,
            int(self.spike_state.dtype.itemsize * self.total_num_spike_neurons))
def _assign(self, value):
    if isinstance(value, (int, float)):
        # if we have a contiguous array, then use the speedy driver kernel
        if self.is_contiguous:
            value = self.dtype.type(value)
            if self.dtype.itemsize == 1:
                drv.memset_d8(self.gpudata,
                              unpack_from('B', value)[0], self.size)
            elif self.dtype.itemsize == 2:
                drv.memset_d16(self.gpudata,
                               unpack_from('H', value)[0], self.size)
            else:
                drv.memset_d32(self.gpudata,
                               unpack_from('I', value)[0], self.size)
        # otherwise use our copy kernel
        else:
            OpTreeNode.build("assign", self, value)
    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("assign", self, value)
    # collapse and execute an op tree as a kernel
    elif isinstance(value, OpTreeNode):
        OpTreeNode.build("assign", self, value)
    # assign to numpy array (same as set())
    elif isinstance(value, np.ndarray):
        self.set(value)
    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))
    return self
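# --- Added sketch of the memset-based fill used by _assign (the struct
# packing mirrors its unpack_from calls; nothing here is specific to the
# surrounding tensor class): the fill value is bit-cast to an unsigned
# integer of the element width and broadcast with memset_d32.
from struct import pack, unpack_from
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as drv
import pycuda.gpuarray as gpuarray

a = gpuarray.empty((8,), np.float32)
bits = unpack_from('I', pack('f', 1.5))[0]  # reinterpret float32 bits as uint32
drv.memset_d32(a.gpudata, bits, a.size)     # count is in 32-bit words
assert np.all(a.get() == 1.5)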
def timeTransition():
    global realDynamics, alpha, applyTransition
    realDynamics = not realDynamics
    applyTransition = False
    if realDynamics:
        cuda.memcpy_dtod(psiK2_d.ptr, psi_d.ptr, psi_d.nbytes)
        cuda.memcpy_dtod(psiRunge_d.ptr, psi_d.ptr, psi_d.nbytes)
        if realTEXTURE:
            copy3DpsiK1Real()
            copy3DpsiK1Imag()
            copy3DpsiK2Real()
            copy3DpsiK2Imag()
        print "Real Dynamics"
    else:
        # GetAlphas
        getAlphas(dx, dy, dz, xMin, yMin, zMin, gammaX, gammaY, gammaZ,
                  psi_d, alphas_d, block=block3D, grid=grid3D)
        alpha = cudaPre((0.5 * (gpuarray.max(alphas_d) +
                                gpuarray.min(alphas_d))).get())  # OPTIMIZATION
        print "Imaginary Dynamics"
def _read_external_input(self):
    if self.input_eof:
        return
    cuda.memcpy_dtod(
        int(int(self.synapse_state.gpudata) +
            self.total_synapses * self.synapse_state.dtype.itemsize),
        int(int(self.I_ext.gpudata) +
            self.frame_count * self.I_ext.ld * self.I_ext.dtype.itemsize),
        self.num_input * self.synapse_state.dtype.itemsize)
    self.frame_count += 1
    if self.frame_count >= self.one_time_import:
        h_ext = self.input_h5file.root.array.read(
            self.file_pointer, self.file_pointer + self.one_time_import)
        if h_ext.shape[0] == self.I_ext.shape[0]:
            self.I_ext.set(h_ext)
            self.file_pointer += self.one_time_import
            self.frame_count = 0
        else:
            if self.file_pointer == self.input_h5file.root.array.shape[0]:
                self.logger.info('Input end of file reached. Behaviour is '
                                 'undefined for subsequent steps')
                self.input_eof = True
def cuTranspose_permute_ept1(a_d, b_d, permutation):
    N = b_d.shape[0]
    if permutation == (2, 1, 0):
        # 210
        permute_210_ept1.prepared_call((N // 32, N // 32, N), (32, 32, 1),
                                       b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (0, 2, 1):
        # 102
        permute_102_ept1.prepared_call((N // 32, N // 32, N), (32, 32, 1),
                                       b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (1, 0, 2):
        # 021
        permute_021_ept1.prepared_call((N // 32, N // 32, N), (32, 32, 1),
                                       b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (1, 2, 0):
        # 201
        permute_201_ept1.prepared_call((N // 32, N // 32, N), (32, 32, 1),
                                       b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (2, 0, 1):
        # 120
        permute_120_ept1.prepared_call((N // 32, N // 32, N), (32, 32, 1),
                                       b_d.gpudata, a_d.gpudata, N, N, N)
    elif permutation == (0, 1, 2):
        # identity permutation: a straight device-to-device copy suffices
        cuda.memcpy_dtod(b_d.gpudata, a_d.gpudata, b_d.nbytes)
def DT_GPU(self, X, c):
    DIM = X.size
    floatSize = X.dtype.itemsize
    q = gpuarray.empty(DIM // 2, X.dtype)
    p = gpuarray.empty(DIM // 2, X.dtype)
    XNext = gpuarray.empty(DIM, X.dtype)

    cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM // 2)
    cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM // 2,
                     floatSize * DIM // 2)

    qNext = q + c * self.dt * self.dTdp(p)
    pNext = p

    cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM // 2)
    cuda.memcpy_dtod(XNext.ptr + floatSize * DIM // 2, pNext.ptr,
                     floatSize * DIM // 2)
    return XNext
def DV_GPU(self, X, d):
    DIM = X.size
    floatSize = X.dtype.itemsize
    q = gpuarray.empty(DIM // 2, X.dtype)
    p = gpuarray.empty(DIM // 2, X.dtype)
    XNext = gpuarray.empty(DIM, X.dtype)

    cuda.memcpy_dtod(q.ptr, X.ptr, floatSize * DIM // 2)
    cuda.memcpy_dtod(p.ptr, X.ptr + floatSize * DIM // 2,
                     floatSize * DIM // 2)

    qNext = q
    pNext = p - d * self.dt * self.dVdq(q)

    cuda.memcpy_dtod(XNext.ptr, qNext.ptr, floatSize * DIM // 2)
    cuda.memcpy_dtod(XNext.ptr + floatSize * DIM // 2, pNext.ptr,
                     floatSize * DIM // 2)
    return XNext
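# --- Added sketch of the (q, p) split that DT_GPU/DV_GPU perform (generic
# names; assumes an even-length state vector): the halves of a concatenated
# device vector are extracted with an offset device-to-device copy.
import numpy as np
import pycuda.autoinit  # noqa: F401
import pycuda.driver as cuda
import pycuda.gpuarray as gpuarray

X = gpuarray.to_gpu(np.arange(8, dtype=np.float64))
half = X.size // 2
half_bytes = half * X.dtype.itemsize
q = gpuarray.empty(half, X.dtype)
p = gpuarray.empty(half, X.dtype)
cuda.memcpy_dtod(q.gpudata, X.gpudata, half_bytes)
cuda.memcpy_dtod(p.gpudata, int(X.gpudata) + half_bytes, half_bytes)
assert np.array_equal(q.get(), np.arange(4, dtype=np.float64))
assert np.array_equal(p.get(), np.arange(4, 8, dtype=np.float64))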
def __setitem__(self, key, other):
    # if args is [:] then assign `other` to the entire ndarray
    if key == slice(None):
        if isinstance(other, pycuda.gpuarray.GPUArray):
            if self.flags.forc and other.flags.forc:
                # both arrays are a contiguous block of memory
                cuda.memcpy_dtod(self.gpudata, other.gpudata, self.nbytes)
                return
        else:
            if self.flags.forc:
                # the destination array is a contiguous block of memory
                cuda.memcpy_htod(self.gpudata, other)
                return
        copy_non_contiguous(self, other)
        return

    # assign `other` to sub-array of self
    sub_array = self[key]
    sub_array[:] = other
def make_pitcharray(dptr, shape, dtype, linear=False, pitch=None):
    """
    Create a PitchArray from a DeviceAllocation pointer.

    linear: True indicates the device memory is linearly allocated;
            False indicates the device memory is allocated by
            cudaMallocPitch, and pitch must be provided.
    """
    if linear:
        result = PitchArray(shape, dtype)
        if result.size:
            if result.M == 1:
                cuda.memcpy_dtod(result.gpudata, dptr,
                                 result.mem_size * dtype.itemsize)
            else:
                PitchTrans(shape, result.gpudata, result.ld, dptr,
                           _pd(shape), dtype)
    else:
        result = PitchArray(shape, dtype, gpudata=dptr, pitch=pitch)
    return result
def matrix_addition(d_a, d_b):
    # Overwrites d_a
    assert d_a.shape == d_b.shape
    if len(d_a.shape) == 1:
        # Vector addition
        cublas.cublasSaxpy(handle, d_a.size, 1.0, d_b.gpudata, 1,
                           d_a.gpudata, 1)
    elif len(d_a.shape) == 2:
        # Matrix addition
        m, n = d_a.shape
        cublas.cublasSgeam(handle, 'N', 'N', m, n, 1.0, d_a.gpudata, m,
                           1.0, d_b.gpudata, m, d_a.gpudata, m)
    else:
        tmp = (d_a.ravel() + d_b.ravel()).reshape(d_a.shape)
        cuda.memcpy_dtod(d_a.gpudata, tmp.gpudata, d_a.nbytes)
    return d_a
def wave_cu(self, dt):
    mpos = cuda_gl.BufferObject(long(self.gpos))
    dpos = mpos.map()
    # self._debug_print()
    self.collision.prepared_call(self.grid, self.block, self.df1,
                                 self.BOX, self.BOX, self.BOX)
    # self._debug_print()
    self.streaming.prepared_call(self.grid, self.block, self.df1, self.df2,
                                 self.BOX, self.BOX, self.BOX)
    cuda_driver.memcpy_dtod(self.df1, self.df2, self.f.nbytes)
    self.colors.prepared_call(self.grid, self.block, dpos.device_ptr(),
                              self.df1, self.BOX, self.BOX, self.BOX)
    dpos.unmap()
    mpos.unregister()