def evaluate(self, params, returnOutputs=False):
    """Run every network of the population over the training set.

    @param params: one Parameters structure per network; the count must
        equal self.popSize
    @type params: list of Parameters
    @param returnOutputs: hand back the fetched output matrix (debug aid)
    @type returnOutputs: bool, default False
    @return the (popSize, trainSet.size) float32 output matrix when
        returnOutputs is true, otherwise None
    @raise ValueError: when len(params) differs from self.popSize
    """
    provided = len(params)
    if self.popSize != provided:
        raise ValueError("Need %d Parameter structures (provided %d)" % (
            self.popSize, provided))

    # Pack all structures into one array object so a single copy uploads them.
    hostParams = (Parameters * provided)(*params)
    driver.memcpy_htod(self.params, hostParams)

    # TODO: remove
    # Clears the output buffer before the launch (4 bytes per output value).
    outputBytes = self.popSize * self.trainSet.size * 4
    driver.memset_d8(self.outputs, 0, outputBytes)

    self.evaluateKernel.prepared_call(
        self.evaluateGridDim,
        self.trainSetDev,
        self.trainSet.size,
        self.params,
        self.popSize,
        self.outputs)
    driver.Context.synchronize()

    # The fetched matrix is kept on the instance even when it is not returned.
    matShape = (self.popSize, self.trainSet.size)
    self.outputsMat = driver.from_device(self.outputs, shape=matShape,
                                         dtype=np.float32)
    if not returnOutputs:
        return None
    return self.outputsMat
def add_ancilla(self, anc_st):
    """Append an ancilla qubit as the new highest bit.

    anc_st selects the ancilla's initial state: 0 for ground, 1 for
    excited.  The enlarged buffer is four times the size of the current
    one; the existing data ends up at byte offset 0 (anc_st == 0) or at
    three times the old size (anc_st == 1), and the remainder is zero.
    """
    # Bytes occupied by the density matrix before the ancilla is added.
    old_nbytes = 2**(2 * self.no_qubits) * 8

    if self.allocated_qubits != self.no_qubits:
        # Spare capacity was allocated earlier: rearrange in place.
        if anc_st == 0:
            # Data stays put; clear the three blocks that follow it.
            drv.memset_d8(int(self.data.gpudata) + old_nbytes,
                          0, 3 * old_nbytes)
        elif anc_st == 1:
            # Move the data into the last block, then clear the first three.
            drv.memcpy_dtod(int(self.data.gpudata) + 3 * old_nbytes,
                            self.data.gpudata, old_nbytes)
            drv.memset_d8(self.data.gpudata, 0, 3 * old_nbytes)
    else:
        # No spare capacity: allocate a zeroed buffer four times as large
        # and drop the old data into the block chosen by anc_st.
        grown = ga.zeros(self._size * 4, np.float64)
        target = int(grown.gpudata) + anc_st * 3 * old_nbytes
        drv.memcpy_dtod(target, self.data.gpudata, old_nbytes)
        self.data = grown

    self._set_no_qubits(self.no_qubits + 1)
def stepFuntion():
    """Refresh the on-screen data and advance the simulation by one step.

    Renders the normalised modulus of psi_d, optionally fills a second
    texture (activity map or velocity field, selected by plotVar), then
    runs one real-time or imaginary-time step.

    NOTE(review): the block nesting below was reconstructed from a
    whitespace-collapsed source - confirm it against the original file.
    """
    # Primary plot: modulus of psi scaled so its maximum maps to 0.95.
    getModulo( psi_d, psiMod_d )
    maxVal = (gpuarray.max(psiMod_d)).get()
    multiplyByScalarReal( cudaPre(0.95/(maxVal)), psiMod_d )
    sendModuloToUCHAR( psiMod_d, plotData_d)
    copyToScreenArray()
    # Secondary plot, only when the renderer shows two textures.
    if volumeRender.nTextures == 2:
        if not realDynamics:
            # Rebuild the per-block activity flags here; presumably the
            # real-time step refreshes them itself - TODO confirm.
            cuda.memset_d8(activity_d.ptr, 0, nBlocks3D )
            findActivityKernel( cudaPre(0.001), psi_d, activity_d, grid=grid3D, block=block3D )
        if plotVar == 1:
            getActivityKernel( psiOther_d, activity_d, grid=grid3D, block=block3D )
        if plotVar == 0:
            if realTEXTURE:
                tex_psiReal.set_array( psiK2Real_array )
                tex_psiImag.set_array( psiK2Imag_array )
                getVelocity_texKernel( dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
            else:
                getVelocityKernel( np.int32(neighbors), dx, dy, dz, psi_d, activity_d, psiOther_d, grid=grid3D, block=block3D )
            # Normalise to a maximum of 1; skipped when the field is all zero
            # to avoid dividing by zero.
            maxVal = (gpuarray.max(psiOther_d)).get()
            if maxVal > 0:
                multiplyByScalarReal( cudaPre(1./maxVal), psiOther_d )
        sendModuloToUCHAR( psiOther_d, plotData_d_1)
        copyToScreenArray_1()
    if applyTransition:
        timeTransition()
    # Advance the state: real-time or imaginary-time evolution.
    if realDynamics:
        realStep()
    else:
        imaginaryStep()
def prepare_dest_package_and_dest_devptr(new_data, dest_package):
    """Choose the output package and device pointer for a destination.

    When new_data is true a fresh device buffer of
    output_package.data_bytes bytes is allocated (possibly swapping other
    data out) and zero-filled.  Otherwise the buffer already registered
    in data_list[u][ss][sp] is reused.

    Returns a (output_package, dest_devptr) tuple.
    """
    output_package = dest_package.copy()

    if not new_data:
        # Device memory already exists for this destination.  If its halo
        # is at least as large as the requested one, describe the output
        # with the existing package; otherwise keep the caller's package.
        # Either way the existing device buffer is the one written to.
        requested_halo = task.dest.data_halo
        existing = data_list[u][ss][sp]
        if requested_halo <= existing.data_halo:
            output_package = existing
        else:
            output_package = dest_package
        return output_package, existing.devptr

    # No device memory exists yet for dest_package: allocate and clear it.
    dest_devptr, new_usage = malloc_with_swap_out(output_package.data_bytes)
    output_package.set_usage(new_usage)
    cuda.memset_d8(dest_devptr, 0, output_package.data_bytes)
    return output_package, dest_devptr
def rk4_iteration():
    """Run one four-stage Runge-Kutta update using eulerStepKernel.

    The activity flags are cleared and recomputed first; each stage then
    launches eulerStepKernel with its own slope coefficient and weight,
    alternating the order of the psiK2_d / psiK1_d buffers.
    """
    cuda.memset_d8(activity_d.ptr, 0, nBlocks3D)
    findActivityKernel(cudaPre(0.00001), psi_d, activity_d,
                       grid=grid3D, block=block3D)

    # One row per stage:
    # (slope coefficient, weight, first K buffer, second K buffer, flag).
    # The flag is 1 only on the fourth stage - presumably it marks the
    # final stage for the kernel; confirm against the kernel source.
    stages = (
        (1.0, 0.5, psiK2_d, psiK1_d, 0),
        (2.0, 0.5, psiK1_d, psiK2_d, 0),
        (2.0, 1.0, psiK2_d, psiK1_d, 0),
        (1.0, 1.0, psiK1_d, psiK2_d, 1),
    )
    for slope, stageWeight, kFirst, kSecond, flag in stages:
        slopeCoef = cudaPre(slope)
        weight = cudaPre(stageWeight)
        eulerStepKernel(
            np.int32(nWidth), np.int32(nHeight), np.int32(nDepth),
            slopeCoef, weight,
            xMin, yMin, zMin, dx, dy, dz, dtReal,
            gammaX, gammaY, gammaZ, omega,
            psi_d, kFirst, kSecond, psiRunge_d,
            np.uint8(flag), activity_d,
            grid=grid3D, block=block3D)
def test_reshuffle_invertible(self):
    """Check that pauli_reshuffle with direction 1 undoes direction 0.

    A random density matrix is taken to the Pauli basis, reshuffled into
    a real buffer, reshuffled back into a zeroed buffer, and passed
    through bit_to_pauli_basis once more; the result must match the input.
    """
    def apply_basis_change(buf):
        # Apply bit_to_pauli_basis once per qubit, lowest bit first.
        for qubit in range(no_qubits):
            bit_to_pauli_basis(buf,
                               np.int32(1 << qubit),
                               np.int32(no_qubits),
                               block=block, grid=grid)

    dm = random_dm10()
    dm_gpu = drv.to_device(dm)
    apply_basis_change(dm_gpu)

    # Forward reshuffle (direction 0) into a zero-initialised real buffer.
    dmreal_gpu = drv.to_device(np.zeros(2**(2 * no_qubits)))
    pauli_reshuffle(dm_gpu, dmreal_gpu,
                    np.int32(no_qubits), np.int32(0),
                    block=block, grid=grid)

    # Backward reshuffle (direction 1) into a fresh, zeroed device buffer.
    dm_gpu2 = drv.mem_alloc(dm.nbytes)
    drv.memset_d8(dm_gpu2, 0, dm.nbytes)
    pauli_reshuffle(dm_gpu2, dmreal_gpu,
                    np.int32(no_qubits), np.int32(1),
                    block=block, grid=grid)
    apply_basis_change(dm_gpu2)

    dm2 = drv.from_device_like(dm_gpu2, dm)
    assert np.allclose(dm, dm2)
def set_ipc_handle(op, shared_queue, handle):
    """Publish IPC handles for a device buffer and a new one-byte lock.

    Allocates a single device byte initialised to 0, then puts the tuple
    (buffer IPC handle, lock IPC handle) on shared_queue.

    Returns the lock's device allocation.  op is not used here.
    """
    flag = drv.mem_alloc(1)
    drv.memset_d8(flag, 0, 1)

    # Tuple elements are built left to right: buffer handle, then lock handle.
    handles = (drv.mem_get_ipc_handle(handle), drv.mem_get_ipc_handle(flag))
    shared_queue.put(handles)
    return flag
def execute(self):
    """Wait for the sender's flag, copy its buffer in, then clear the flag."""
    # Busy-wait: re-read the one-byte device flag until it is non-zero.
    while True:
        flag = drv.from_device(self.sender_ready, (1, ), np.int8)
        if not (flag == 0):
            break

    # Device-to-device copy of the whole tensor from the sender's buffer.
    local = self.tensor.tensor
    drv.memcpy_dtod(local.gpudata, self.sender_buf,
                    local.size * self.op.dtype.itemsize)

    # Reset the flag so the next transfer can be signalled.
    drv.memset_d8(self.sender_ready, 0, 1)
def stepFunction():
    """Refresh the plot buffer, run a batch of iterations, update counters.

    Optionally rebuilds the activity maps, copies the current output
    concentration into the plot buffer scaled so its maximum maps to 100,
    runs the iterations for this frame, and every 25th call (when
    plotting) records the maximum and the total concentration.
    """
    global animIter

    if showActivity:
        # Clear the per-block flags before recomputing activity.
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10), concentrationIn_d, activeBlocks_d,
                           grid=grid2D, block=block2D)
        getActivityKernel(activeBlocks_d, activeThreads_d,
                          grid=grid2D, block=block2D)

    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100. / maxVal, plotData_d,
                  np.uint8(showActivity), activeThreads_d)

    # The non-"float" variant runs half as many calls per plot.
    if cudaP == "float":
        for _ in range(nIterationsPerPlot):
            oneIteration_tex()
    else:
        for _ in range(nIterationsPerPlot // 2):
            oneIteration_sh()

    if plotting and animIter % 25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
def execute(self):
    """Wait for each sender's ready flag in turn and clear it once seen.

    One flag is handled per entry of self.op.from_id, in index order.
    """
    sender_count = len(self.op.from_id)
    for idx in range(sender_count):
        flag_ptr = self.sender_ready[idx]
        # Busy-wait until this sender's one-byte device flag is non-zero.
        while True:
            flag = drv.from_device(flag_ptr, (1, ), np.int8)
            if not (flag == 0):
                break
        drv.memset_d8(flag_ptr, 0, 1)
def set_ipc_handle(op, shared_queue, handle, local=False):
    """Publish handles for a device buffer and a new one-byte lock.

    Allocates a single device byte initialised to 0.  When local is true
    the raw integer addresses of the buffer and the lock are published;
    otherwise their CUDA IPC handles are.  The tuple put on shared_queue
    is (local, buffer handle, lock handle).

    Returns the lock's device allocation.  op is not used here.
    """
    flag = drv.mem_alloc(1)
    drv.memset_d8(flag, 0, 1)

    if not local:
        buf_ref = drv.mem_get_ipc_handle(handle)
        flag_ref = drv.mem_get_ipc_handle(flag)
    else:
        buf_ref = int(handle)
        flag_ref = int(flag)

    shared_queue.put((local, buf_ref, flag_ref))
    return flag
def execute(positions, num_particles, num_frames):
    """Run the "runframe" kernel num_frames times and collect positions.

    Args:
        positions: initial particle positions; converted to a float32
            numpy array before upload.
        num_particles: number of particles (sizes the velocity and
            in-bounds buffers and the launch grid).
        num_frames: number of kernel launches / frames to record.

    Returns:
        A list with one float32 numpy array per frame, each a copy of the
        positions read back after that frame's kernel launch.
    """
    # Host copy of the positions; also reused as the read-back buffer.
    cpuPos = numpy.array(positions, dtype=numpy.float32)

    devPos = devVels = devInBounds = None
    try:
        # Device positions, initialised from the host array.
        devPos = cuda.mem_alloc(cpuPos.nbytes)
        cuda.memcpy_htod(devPos, cpuPos)

        # Device velocities: two float32 values per particle, all zero.
        devVels = cuda.mem_alloc(2 * num_particles * numpy.float32().nbytes)
        cuda.memset_d32(devVels, 0, 2 * num_particles)

        # One flag byte per particle, every flag starting at 1 (True).
        # numpy.bool_ is used because the bool8 alias no longer exists in
        # NumPy 2.0.
        # NOTE(review): an earlier comment here said the flags start as
        # "false" while the code writes True - confirm what the kernel expects.
        devInBounds = cuda.mem_alloc(num_particles * numpy.bool_().nbytes)
        cuda.memset_d8(devInBounds, True, num_particles)

        grid_dim = ((num_particles // NUM_THREADS) + 1, 1)
        # Parenthesised form prints the same tuple on Python 2 and 3.
        print(grid_dim)

        runframe = module.get_function("runframe")
        frames = []
        for _ in range(num_frames):
            runframe(devPos, devVels, devInBounds,
                     numpy.int32(num_particles),
                     grid=grid_dim, block=(NUM_THREADS, 1, 1))
            # Read the positions back; copy because cpuPos is overwritten
            # on the next frame.
            cuda.memcpy_dtoh(cpuPos, devPos)
            frames.append(cpuPos.copy())
        return frames
    finally:
        # Release device memory even when an allocation or launch fails.
        for devBuf in (devPos, devVels, devInBounds):
            if devBuf is not None:
                devBuf.free()
def rk4_FFT_iteration():
    """Run one four-stage Runge-Kutta update using FFT-based derivatives.

    The activity flags are cleared and recomputed first.  Each stage
    transforms one K buffer into psiFFT_d, has getFFTderivatives fill
    partialX_d / partialY_d / laplacian_d, transforms those three back
    with the inverse FFT, and launches eulerStep_FFTKernel.
    """
    cuda.memset_d8(activity_d.ptr, 0, nBlocks3D)
    findActivityKernel(cudaPre(0.00001), psi_d, activity_d,
                       grid=grid3D, block=block3D)

    # One row per stage:
    # (slope coefficient, weight, first K buffer, second K buffer, flag).
    # The first K buffer is also the one fed to the forward FFT.  The flag
    # is 1 only on the fourth stage - presumably it marks the final stage
    # for the kernel; confirm against the kernel source.
    stages = (
        (1.0, 0.5, psiK2_d, psiK1_d, 0),
        (2.0, 0.5, psiK1_d, psiK2_d, 0),
        (2.0, 1.0, psiK2_d, psiK1_d, 0),
        (1.0, 1.0, psiK1_d, psiK2_d, 1),
    )
    for slope, stageWeight, kFirst, kSecond, flag in stages:
        slopeCoef = cudaPre(slope)
        weight = cudaPre(stageWeight)

        fftPlan.execute(kFirst, psiFFT_d)
        getFFTderivatives(Lx, Ly, Lz, psiFFT_d,
                          fftKx_d, fftKy_d, fftKz_d,
                          partialX_d, partialY_d, laplacian_d,
                          grid=grid3D, block=block3D)
        # In-place inverse transforms, in the same order every stage.
        for derivative in (partialX_d, partialY_d, laplacian_d):
            fftPlan.execute(derivative, inverse=True)

        eulerStep_FFTKernel(
            np.int32(nWidth), np.int32(nHeight), np.int32(nDepth),
            slopeCoef, weight,
            xMin, yMin, zMin, dx, dy, dz, dtReal,
            gammaX, gammaY, gammaZ, x0, y0, omega,
            psi_d, kFirst, kSecond, psiRunge_d,
            laplacian_d, partialX_d, partialY_d,
            np.uint8(flag), activity_d,
            grid=grid3D, block=block3D)
def memset(self, allocation, value, size):
    """Fill a GPU allocation with a single byte value.

    :param allocation: The GPU memory allocation to fill
    :type allocation: pycuda.driver.DeviceAllocation

    :param value: The byte written to every position
    :type value: a single 8-bit unsigned int

    :param size: The size of the allocation in bytes
    :type size: int
    """
    # Byte-wise driver memset: one `value` byte per position, `size` positions.
    drv.memset_d8(allocation, value, size)
def execute(self):
    """Copy this fragment into the receiver's buffer and signal completion.

    On first use the receiver's IPC handles are opened and the
    destination address for this sender's section is computed.
    """
    if self.recvr_buf is None:
        # set_ipc_handle must be called before open_ipc_handle in certain
        # cases to avoid a hang, so the handle is set in bind_buffers and
        # only opened here.  See the corresponding comment in the
        # ScatterRecv kernel for details.
        queue = self.op._shared_queues[self.op.idx]
        self.tnsr_ipc_hdl, self.send_ready = open_ipc_handle(queue)
        fragment_bytes = self.tensor.tensor.size * self.op.dtype.itemsize
        # This sender's section starts op.idx fragments into the buffer.
        self.recvr_buf = int(self.tnsr_ipc_hdl) + self.op.idx * fragment_bytes

    # Push our fragment into its section of the larger receiver buffer,
    # which assumes the gather axis is least contiguous.
    fragment = self.tensor.tensor
    drv.memcpy_dtod(self.recvr_buf, fragment.gpudata,
                    fragment.size * self.op.dtype.itemsize)

    # Set the one-byte ready flag to 1.
    drv.memset_d8(self.send_ready, 1, 1)
def stepFunction():
    """Refresh the plot buffer, run a batch of iterations, update counters.

    Optionally rebuilds the activity maps, copies the current output
    concentration into the plot buffer scaled so its maximum maps to 100,
    runs the iterations for this frame, and every 25th call (when
    plotting) records the maximum and the total concentration.
    """
    global animIter

    if showActivity:
        # Clear the per-block flags before recomputing activity.
        cuda.memset_d8(activeBlocks_d.ptr, 0, nBlocks)
        findActivityKernel(cudaPre(1.e-10), concentrationIn_d, activeBlocks_d,
                           grid=grid2D, block=block2D)
        getActivityKernel(activeBlocks_d, activeThreads_d,
                          grid=grid2D, block=block2D)

    cuda.memcpy_dtod(plotData_d.ptr, concentrationOut_d.ptr,
                     concentrationOut_d.nbytes)
    maxVal = gpuarray.max(plotData_d).get()
    scalePlotData(100./maxVal, plotData_d,
                  np.uint8(showActivity), activeThreads_d)

    # The non-"float" variant runs half as many calls per plot.
    if cudaP == "float":
        for _ in range(nIterationsPerPlot):
            oneIteration_tex()
    else:
        for _ in range(nIterationsPerPlot//2):
            oneIteration_sh()

    if plotting and animIter%25 == 0:
        maxVals.append(maxVal)
        sumConc.append(gpuarray.sum(concentrationIn_d).get())
        plotData(maxVals, sumConc)
    animIter += 1
def prepare_dest_package_and_dest_devptr(new_data, dest_package):
    """Choose the output package and device pointer for a destination.

    When new_data is true a fresh device buffer of
    output_package.buffer_bytes bytes is allocated (possibly swapping
    other data out) and zero-filled.  Otherwise the buffer already
    registered in data_list[u][ss][sp] is reused.

    Returns a (output_package, dest_devptr) tuple.
    """
    output_package = dest_package.copy()
    if new_data:
        # Create new data: allocate the device buffer and clear it.
        dest_devptr, new_usage = malloc_with_swap_out(output_package.buffer_bytes)
        output_package.usage = new_usage
        cuda.memset_d8(dest_devptr, 0, output_package.buffer_bytes)
    else:
        existing_package = data_list[u][ss][sp]
        new_data_halo = task.dest.data_halo
        # Compare halo sizes, not a halo against the package object itself
        # (which is an arbitrary ordering on Python 2 and a TypeError on
        # Python 3).
        exist_data_halo = existing_package.data_halo
        if new_data_halo < exist_data_halo:
            # The existing buffer's halo is larger: describe the output
            # with the existing package.
            output_package = existing_package
        else:
            output_package = dest_package
        # Either way the existing device buffer is the one written to.
        dest_devptr = existing_package.devptr
    return output_package, dest_devptr
def _assign(self, value):
    """Assign a scalar, tensor, op-tree or numpy array to this tensor.

    Arguments:
        value: an int/float scalar, a GPUTensor, an OpTreeNode or a
            numpy.ndarray.

    Returns:
        self, so that calls can be chained.

    Raises:
        TypeError: if value is none of the supported types.
    """
    if isinstance(value, (int, float)):
        itemsize = self.dtype.itemsize
        # The driver memsets write 1-, 2- or 4-byte patterns only.  Any
        # other element size (e.g. 8-byte dtypes) must not go through
        # memset_d32: it would write a truncated pattern over only
        # `size` 32-bit words.  Those, and non-contiguous tensors, use
        # the assign kernel instead.
        if self.is_contiguous and itemsize in (1, 2, 4):
            # Convert to the tensor's dtype, then reinterpret the scalar's
            # bytes as the unsigned integer pattern the driver expects.
            scalar = self.dtype.type(value)
            if itemsize == 1:
                drv.memset_d8(self.gpudata,
                              unpack_from('B', scalar)[0],
                              self.size)
            elif itemsize == 2:
                drv.memset_d16(self.gpudata,
                               unpack_from('H', scalar)[0],
                               self.size)
            else:
                drv.memset_d32(self.gpudata,
                               unpack_from('I', scalar)[0],
                               self.size)
        else:
            # Otherwise use our copy kernel.
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            # Same dtype and both contiguous: a raw device copy suffices.
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, OpTreeNode):
        # Collapse and execute an op tree as a kernel.
        OpTreeNode.build("assign", self, value)

    elif isinstance(value, np.ndarray):
        # Assign from a numpy array (same as set()).
        self.set(value)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
def rk4_texture_iteration():
    """Run one four-stage Runge-Kutta update using texture/surface arrays.

    The activity flags are cleared and recomputed first.  Each stage
    binds one pair of K arrays to the texture references and the other
    pair to the surface references, then launches eulerStep_textKernel.
    """
    cuda.memset_d8(activity_d.ptr, 0, nBlocks3D)
    findActivityKernel(cudaPre(0.00001), psi_d, activity_d,
                       grid=grid3D, block=block3D)

    k1Arrays = (psiK1Real_array, psiK1Imag_array)
    k2Arrays = (psiK2Real_array, psiK2Imag_array)

    # One row per stage:
    # (slope coefficient, weight, texture arrays, surface arrays, flag).
    # The flag is 1 only on the fourth stage - presumably it marks the
    # final stage for the kernel; confirm against the kernel source.
    stages = (
        (1.0, 0.5, k2Arrays, k1Arrays, 0),
        (2.0, 0.5, k1Arrays, k2Arrays, 0),
        (2.0, 1.0, k2Arrays, k1Arrays, 0),
        (1.0, 1.0, k1Arrays, k2Arrays, 1),
    )
    for slope, stageWeight, texArrays, surfArrays, flag in stages:
        slopeCoef = cudaPre(slope)
        weight = cudaPre(stageWeight)

        tex_psiReal.set_array(texArrays[0])
        tex_psiImag.set_array(texArrays[1])
        surf_psiReal.set_array(surfArrays[0])
        surf_psiImag.set_array(surfArrays[1])

        eulerStep_textKernel(
            slopeCoef, weight,
            xMin, yMin, zMin, dx, dy, dz, dtReal,
            gammaX, gammaY, gammaZ, omega,
            psi_d, psiRunge_d,
            np.uint8(flag), activity_d,
            grid=grid3D, block=block3D)
def _assign(self, value):
    """Assign a scalar, tensor, op-tree or numpy array to this tensor.

    Arguments:
        value: an int/float scalar, a GPUTensor, an OpTreeNode or a
            numpy.ndarray.

    Returns:
        self, so that calls can be chained.

    Raises:
        TypeError: if value is none of the supported types.
    """
    if isinstance(value, (int, float)):
        itemsize = self.dtype.itemsize
        # The driver memsets write 1-, 2- or 4-byte patterns only.  Any
        # other element size (e.g. 8-byte dtypes) must not go through
        # memset_d32: it would write a truncated pattern over only
        # `size` 32-bit words.  Those, and non-contiguous tensors, use
        # the assign kernel instead.
        if self.is_contiguous and itemsize in (1, 2, 4):
            # Convert to the tensor's dtype, then reinterpret the scalar's
            # bytes as the unsigned integer pattern the driver expects.
            scalar = self.dtype.type(value)
            if itemsize == 1:
                drv.memset_d8(self.gpudata,
                              unpack_from('B', scalar)[0],
                              self.size)
            elif itemsize == 2:
                drv.memset_d16(self.gpudata,
                               unpack_from('H', scalar)[0],
                               self.size)
            else:
                drv.memset_d32(self.gpudata,
                               unpack_from('I', scalar)[0],
                               self.size)
        else:
            # Otherwise use our copy kernel.
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, GPUTensor):
        # TODO: add an is_binary_compat like function
        if self.is_contiguous and value.is_contiguous and self.dtype == value.dtype:
            # Same dtype and both contiguous: a raw device copy suffices.
            drv.memcpy_dtod(self.gpudata, value.gpudata, self.nbytes)
        else:
            OpTreeNode.build("assign", self, value)

    elif isinstance(value, OpTreeNode):
        # Collapse and execute an op tree as a kernel.
        OpTreeNode.build("assign", self, value)

    elif isinstance(value, np.ndarray):
        # Assign from a numpy array (same as set()).
        self.set(value)

    else:
        raise TypeError("Invalid type for assignment: %s" % type(value))

    return self
def prepare_dest_package_and_dest_devptr(new_data, dest_package):
    """Choose the output package and device pointer for a destination.

    When new_data is true a fresh device buffer of
    output_package.buffer_bytes bytes is allocated (possibly swapping
    other data out) and zero-filled.  Otherwise the buffer already
    registered in data_list[u][ss][sp] is reused.

    Returns a (output_package, dest_devptr) tuple.
    """
    output_package = dest_package.copy()
    if new_data:
        # Create new data: allocate the device buffer and clear it.
        dest_devptr, new_usage = malloc_with_swap_out(
            output_package.buffer_bytes)
        output_package.usage = new_usage
        cuda.memset_d8(dest_devptr, 0, output_package.buffer_bytes)
    else:
        existing_package = data_list[u][ss][sp]
        new_data_halo = task.dest.data_halo
        # Compare halo sizes, not a halo against the package object itself
        # (which is an arbitrary ordering on Python 2 and a TypeError on
        # Python 3).
        exist_data_halo = existing_package.data_halo
        if new_data_halo < exist_data_halo:
            # The existing buffer's halo is larger: describe the output
            # with the existing package.
            output_package = existing_package
        else:
            output_package = dest_package
        # Either way the existing device buffer is the one written to.
        dest_devptr = existing_package.devptr
    return output_package, dest_devptr
def step_stage1(self):
    """Compute accelerations on the GPU from the current positions.

    Uploads the three position columns, zeroes the three acceleration
    buffers, launches sm_update_pair once per entry of self.index_range,
    and copies the accelerations back into self.mass_a_array.
    """
    position_bufs = (self.mass_rx_array_g,
                     self.mass_ry_array_g,
                     self.mass_rz_array_g)
    accel_bufs = (self.mass_ax_array_g,
                  self.mass_ay_array_g,
                  self.mass_az_array_g)

    # Host -> device: one column of mass_r_array per axis buffer.
    for axis, dev_buf in enumerate(position_bufs):
        cuda.memcpy_htod(dev_buf, self.mass_r_array[:, axis])

    # Accelerations start from zero on every call.
    for dev_buf in accel_bufs:
        cuda.memset_d8(dev_buf, 0, self.MEM_LEN)

    # Run "pair" calculation: one object against vector of objects per
    # iteration.
    kernel_args = position_bufs + accel_bufs + (self.mass_m_array_g,)
    for row_np, threads_per_block, blocks in self.index_range:
        self.sm_update_pair(row_np, self.MASS_LEN_np, *kernel_args,
                            block=(threads_per_block, 1, 1),
                            grid=(blocks, 1))

    # Device -> host: read the accelerations back, one column per axis.
    for axis, dev_buf in enumerate(accel_bufs):
        cuda.memcpy_dtoh(self.mass_a_array[:, axis], dev_buf)
def execute(self):
    """Copy this fragment into the receiver's buffer and signal completion."""
    fragment = self.tensor.tensor
    fragment_bytes = fragment.size * self.op.dtype.itemsize

    # Push our fragment into its section of the larger receiver buffer,
    # which assumes the gather axis is least contiguous.
    drv.memcpy_dtod(self.recvr_buf, fragment.gpudata, fragment_bytes)

    # Set the one-byte ready flag to 1.
    drv.memset_d8(self.send_ready, 1, 1)
def func():
    """Zero the device buffer devU, then launch the prepared kernel once."""
    # Clear as many bytes on the device as the host array cpuU occupies.
    byte_count = cpuU.nbytes
    drv.memset_d8(devU, 0, byte_count)

    # Launch with the grid and block shapes followed by the kernel arguments.
    launch_args = (grid, block) + tuple(parms)
    kernel.prepared_call(*launch_args)
def execute(self):
    """Set the one-byte ready flag of every destination to 1.

    One flag is written per entry of self.op.to_id, in index order.
    """
    destination_count = len(self.op.to_id)
    for idx in range(destination_count):
        flag_ptr = self.send_ready[idx]
        drv.memset_d8(flag_ptr, 1, 1)