def _from_device(self):
    """Copy the device buffer back into the host array if the device
    copy is the only current one.

    Only acts when ``self.state`` is ``DeviceDataMixin.DEVICE``; after a
    successful transfer the state becomes ``DeviceDataMixin.BOTH``.  The
    host array's WRITEABLE flag is temporarily raised so the transfer can
    write into it, and restored afterwards.

    Fix over previous version: the flag is now restored in a ``finally``
    block, so an exception during the transfer (or the AoS conversion)
    no longer leaves the host array spuriously writeable.
    """
    flag = self._data.flags['WRITEABLE']
    # Temporarily make the host buffer writeable so .get() can fill it.
    maybe_setflags(self._data, write=True)
    try:
        if self.state is DeviceDataMixin.DEVICE:
            self._device_data.get(_queue, self._data)
            # Convert back from SoA device layout if necessary
            # (assumed from the helper's name — TODO confirm).
            self._data = self._maybe_to_aos(self._data)
            self.state = DeviceDataMixin.BOTH
    finally:
        # Always restore the caller-visible writeability, even on error.
        maybe_setflags(self._data, write=flag)
def _compute(self, part):
    """Execute this parallel loop over set partition *part* on the
    OpenCL device.

    Builds the launch configuration (and, for indirect loops, an
    execution Plan with partitioning/colouring), JIT-compiles the
    kernel, marshals device buffers into a flat argument list, and
    launches the kernel — once for direct loops, once per colour for
    indirect loops.  Finally marks all non-READ data as dirty on the
    device and schedules reduction post-processing.
    """
    conf = self.launch_configuration()

    if self._is_indirect:
        # Partition and colour the iteration set; the plan dictates
        # shared-memory needs and the block/colour launch structure.
        _plan = Plan(part,
                     *self._unwound_args,
                     partition_size=conf["partition_size"],
                     matrix_coloring=self._requires_matrix_coloring)

        conf["local_memory_size"] = _plan.nshared
        conf["ninds"] = _plan.ninds
        conf["work_group_size"] = min(_max_work_group_size,
                                      conf["partition_size"])
        conf["work_group_count"] = _plan.nblocks
    conf["warpsize"] = _warpsize
    conf["op2stride"] = self._it_space.size

    fun = JITModule(self.kernel, self.it_space, *self.args,
                    parloop=self, conf=conf)

    # Marshal kernel arguments.  NOTE(review): the append order below
    # must match the parameter order of the generated kernel — do not
    # reorder these loops.
    args = []
    for arg in self._unique_args:
        arg.data._allocate_device()
        # WRITE-only data need not be copied to the device first.
        if arg.access is not device.WRITE:
            arg.data._to_device()

    for a in self._unique_dat_args:
        args.append(a.data.array.data)
    for a in self._all_global_non_reduction_args:
        args.append(a.data._array.data)
    for a in self._all_global_reduction_args:
        # One reduction slot per work group; combined on the host later.
        a.data._allocate_reduction_array(conf["work_group_count"])
        args.append(a.data._d_reduc_array.data)
    for cst in Const._definitions():
        args.append(cst._array.data)
    for m in self._unique_matrix:
        args.append(m._dev_array.data)
        m._to_device()
        args.append(m._rowptr.data)
        args.append(m._colidx.data)
    for m in self._matrix_entry_maps:
        m._to_device()
        args.append(m._device_values.data)

    if self._is_direct:
        args.append(np.int32(part.size))
        args.append(np.int32(part.offset))
        # Single launch covers the whole partition.
        fun(conf["thread_count"], conf["work_group_size"], *args)
    else:
        args.append(np.int32(part.size))
        args.append(np.int32(part.offset))
        args.append(_plan.ind_map.data)
        args.append(_plan.loc_map.data)
        args.append(_plan.ind_sizes.data)
        args.append(_plan.ind_offs.data)
        args.append(_plan.blkmap.data)
        args.append(_plan.offset.data)
        args.append(_plan.nelems.data)
        args.append(_plan.nthrcol.data)
        args.append(_plan.thrcol.data)
        block_offset = 0
        # Placeholder; patched per colour below (args[-1]).
        args.append(0)
        # One launch per colour: blocks of the same colour have no
        # write conflicts and can run concurrently.
        for i in range(_plan.ncolors):
            blocks_per_grid = int(_plan.ncolblk[i])
            threads_per_block = min(_max_work_group_size,
                                    conf["partition_size"])
            thread_count = threads_per_block * blocks_per_grid
            args[-1] = np.int32(block_offset)
            fun(int(thread_count), int(threads_per_block), *args)
            block_offset += blocks_per_grid

    # mark !READ data as dirty
    for arg in self.args:
        if arg.access is not READ:
            arg.data.state = DeviceDataMixin.DEVICE
            if arg._is_dat:
                # Device copy is now authoritative; lock the host array.
                maybe_setflags(arg.data._data, write=False)

    for a in self._all_global_reduction_args:
        # Combine the per-work-group partial reductions.
        a.data._post_kernel_reduction_task(conf["work_group_count"],
                                           a.access)
def compute(self):
    """Execute this parallel loop on the CUDA device.

    Direct loops launch the kernel once with a precomputed
    configuration; indirect loops build an execution Plan (partitioning
    and colouring) and launch once per colour, patching the block
    offset and colour block count into fixed argument-list slots.
    Global reductions are staged: partial buffers are allocated per
    grid, copied back after the last "owned" colour, and finalised at
    the end.
    """
    if self._has_soa:
        # Stride constant for struct-of-arrays data layout; removed
        # from the namespace again at the end of this call.
        op2stride = Const(1, self._it_space.size, name='op2stride',
                          dtype='int32')

    arglist = [np.int32(self._it_space.size)]
    config = self.launch_configuration()
    fun = JITModule(self.kernel, self.it_space.extents, *self.args,
                    parloop=self, config=config)

    if self._is_direct:
        _args = self.args
        block_size = config['block_size']
        max_grid_size = config['grid_size']
        shared_size = config['required_smem']
    else:
        _args = self._unique_args
        # Per-element bytes staged in shared memory for indirect args.
        maxbytes = sum([a.dtype.itemsize * a.data.cdim
                        for a in self._unwound_args if a._is_indirect])
        # shared memory as reported by the device, divided by some
        # factor. This is the same calculation as done inside
        # op_plan_core, but without assuming 48K shared memory.
        # It would be much nicer if we could tell op_plan_core "I
        # have X bytes shared memory"
        part_size = (_AVAILABLE_SHARED_MEMORY / (64 * maxbytes)) * 64
        self._plan = Plan(self.kernel, self._it_space.iterset,
                          *self._unwound_args,
                          partition_size=part_size)
        max_grid_size = self._plan.ncolblk.max()

    # Marshal device pointers.  NOTE(review): append order must match
    # the generated kernel's parameter order — do not reorder.
    for arg in _args:
        if arg._is_mat:
            # Matrices are passed as an LMA buffer pointer + offset.
            d = arg.data._lmadata.gpudata
            offset = arg.data._lmaoffset(self._it_space.iterset)
            arglist.append(np.intp(d))
            arglist.append(np.int32(offset))
        else:
            arg.data._allocate_device()
            # WRITE-only data need not be copied to the device first.
            if arg.access is not op2.WRITE:
                arg.data._to_device()
            karg = arg.data._device_data
            if arg._is_global_reduction:
                arg.data._allocate_reduction_buffer(max_grid_size,
                                                    arg.access)
                karg = arg.data._reduction_buffer
            arglist.append(np.intp(karg.gpudata))

    if self._is_direct:
        _stream.synchronize()
        fun(max_grid_size, block_size, _stream, *arglist,
            shared_size=shared_size)
        for arg in self.args:
            if arg._is_global_reduction:
                arg.data._finalise_reduction_begin(max_grid_size,
                                                   arg.access)
                arg.data._finalise_reduction_end(max_grid_size,
                                                 arg.access)
            else:
                # Set write state to False
                maybe_setflags(arg.data._data, write=False)
                # Data state is updated in finalise_reduction for Global
                if arg.access is not op2.READ:
                    arg.data.state = DeviceDataMixin.DEVICE
    else:
        arglist.append(self._plan.ind_map.gpudata)
        arglist.append(self._plan.loc_map.gpudata)
        arglist.append(self._plan.ind_sizes.gpudata)
        arglist.append(self._plan.ind_offs.gpudata)
        arglist.append(None)  # Block offset (slot -7, patched per colour)
        arglist.append(self._plan.blkmap.gpudata)
        arglist.append(self._plan.offset.gpudata)
        arglist.append(self._plan.nelems.gpudata)
        arglist.append(self._plan.nthrcol.gpudata)
        arglist.append(self._plan.thrcol.gpudata)
        arglist.append(None)  # Number of colours in this block (slot -1)
        block_offset = 0
        for col in xrange(self._plan.ncolors):
            # At this point, before we can continue processing in
            # the MPI case, we'll need to wait for halo swaps to
            # complete, but at the moment we don't support that
            # use case, so we just pass through for now.
            if col == self._plan.ncolors_core:
                pass
            blocks = self._plan.ncolblk[col]
            if blocks > 0:
                arglist[-1] = np.int32(blocks)
                arglist[-7] = np.int32(block_offset)
                blocks = np.asscalar(blocks)
                # Compute capability < 3 can handle at most 2**16 - 1
                # blocks in any one dimension of the grid.
                if blocks >= 2**16:
                    grid_size = (2**16 - 1, (blocks - 1)/(2**16-1) + 1, 1)
                else:
                    grid_size = (blocks, 1, 1)
                block_size = (128, 1, 1)
                shared_size = np.asscalar(self._plan.nsharedCol[col])
                # Global reductions require shared memory of at least block
                # size * sizeof(double) for the reduction buffer
                if any(arg._is_global_reduction for arg in self.args):
                    shared_size = max(128 * 8, shared_size)
                _stream.synchronize()
                fun(grid_size, block_size, _stream, *arglist,
                    shared_size=shared_size)
                # We've reached the end of elements that should
                # contribute to a reduction (this is only different
                # from the total number of elements in the MPI case).
                # So copy the reduction array back to the host now (so
                # that we don't double count halo elements). We'll
                # finalise the reduction a little later.
                if col == self._plan.ncolors_owned - 1:
                    for arg in self.args:
                        if arg._is_global_reduction:
                            arg.data._finalise_reduction_begin(
                                max_grid_size, arg.access)
            block_offset += blocks
        for arg in self.args:
            if arg._is_global_reduction:
                arg.data._finalise_reduction_end(max_grid_size,
                                                 arg.access)
            elif not arg._is_mat:
                # Data state is updated in finalise_reduction for Global
                if arg.access is not op2.READ:
                    arg.data.state = DeviceDataMixin.DEVICE
            else:
                # Mat, assemble from lma->csr
                arg.data._assemble(rowmap=arg.map[0], colmap=arg.map[1])
    if self._has_soa:
        op2stride.remove_from_namespace()
def compute(self):
    """Execute this parallel loop on the OpenCL device.

    Mirrors the CUDA path: build the launch configuration (plus a Plan
    for indirect loops), JIT-compile the kernel, marshal device buffers
    in the generated kernel's argument order, launch (once for direct
    loops, once per colour for indirect loops), then mark written data
    dirty, assemble matrices and finish global reductions.
    """
    if self._has_soa:
        # Stride constant for struct-of-arrays layout; removed from the
        # namespace again at the end of this call.
        op2stride = Const(1, self._it_space.size, name='op2stride',
                          dtype='int32')

    conf = self.launch_configuration()

    if self._is_indirect:
        self._plan = Plan(self.kernel, self._it_space.iterset,
                          *self._unwound_args,
                          partition_size=conf['partition_size'],
                          matrix_coloring=self._requires_matrix_coloring)
        conf['local_memory_size'] = self._plan.nshared
        conf['ninds'] = self._plan.ninds
        conf['work_group_size'] = min(_max_work_group_size,
                                      conf['partition_size'])
        conf['work_group_count'] = self._plan.nblocks
    conf['warpsize'] = _warpsize

    fun = JITModule(self.kernel, self.it_space.extents, *self.args,
                    parloop=self, conf=conf)

    # NOTE(review): the append order below must match the parameter
    # order of the generated kernel — do not reorder these loops.
    args = []
    for arg in self._unique_args:
        arg.data._allocate_device()
        # WRITE-only data need not be copied to the device first.
        if arg.access is not device.WRITE:
            arg.data._to_device()
    for a in self._unique_dat_args:
        args.append(a.data.array.data)
    for a in self._all_global_non_reduction_args:
        args.append(a.data._array.data)
    for a in self._all_global_reduction_args:
        # One reduction slot per work group; combined on the host later.
        # (For direct loops 'work_group_count' is assumed to come from
        # launch_configuration() — TODO confirm.)
        a.data._allocate_reduction_array(conf['work_group_count'])
        args.append(a.data._d_reduc_array.data)
    for cst in Const._definitions():
        args.append(cst._array.data)
    for m in self._unique_matrix:
        args.append(m._dev_array.data)
        m._upload_array()
        args.append(m._rowptr.data)
        args.append(m._colidx.data)
    for m in self._matrix_entry_maps:
        m._to_device()
        args.append(m._device_values.data)

    if self._is_direct:
        args.append(np.int32(self._it_space.size))
        # Single launch covers the whole iteration space.
        fun(conf['thread_count'], conf['work_group_size'], *args)
    else:
        args.append(np.int32(self._it_space.size))
        args.append(self._plan.ind_map.data)
        args.append(self._plan.loc_map.data)
        args.append(self._plan.ind_sizes.data)
        args.append(self._plan.ind_offs.data)
        args.append(self._plan.blkmap.data)
        args.append(self._plan.offset.data)
        args.append(self._plan.nelems.data)
        args.append(self._plan.nthrcol.data)
        args.append(self._plan.thrcol.data)
        block_offset = 0
        # Placeholder; patched per colour below (args[-1]).
        args.append(0)
        # One launch per colour: same-colour blocks are conflict-free.
        for i in range(self._plan.ncolors):
            blocks_per_grid = int(self._plan.ncolblk[i])
            threads_per_block = min(_max_work_group_size,
                                    conf['partition_size'])
            thread_count = threads_per_block * blocks_per_grid
            args[-1] = np.int32(block_offset)
            fun(int(thread_count), int(threads_per_block), *args)
            block_offset += blocks_per_grid

    # mark !READ data as dirty
    for arg in self.args:
        if arg.access is not READ:
            arg.data.state = DeviceDataMixin.DEVICE
            if arg._is_dat:
                # Device copy is now authoritative; lock the host array.
                maybe_setflags(arg.data._data, write=False)

    for mat in [arg.data for arg in self._matrix_args]:
        mat.assemble()

    for a in self._all_global_reduction_args:
        # Combine the per-work-group partial reductions.
        a.data._post_kernel_reduction_task(conf['work_group_count'],
                                           a.access)
    if self._has_soa:
        op2stride.remove_from_namespace()
def _compute(self, part):
    """Execute this parallel loop over set partition *part* on the
    CUDA device.

    Direct loops launch the kernel once with the precomputed
    configuration; indirect loops build an execution Plan (partitioning
    and colouring) and launch once per colour, patching the colour's
    block count and block offset into fixed argument-list slots.  After
    all launches the stream is synchronized, reductions are finalised
    and written data is marked device-dirty.
    """
    arglist = [np.int32(part.size), np.int32(part.offset)]
    config = self.launch_configuration(part)
    fun = JITModule(self.kernel, self.it_space, *self.args,
                    parloop=self, config=config)

    if self._is_direct:
        _args = self.args
        block_size = config['block_size']
        max_grid_size = config['grid_size']
        shared_size = config['required_smem']
    else:
        _args = self._unique_args
        # Per-element bytes staged in shared memory for indirect args.
        maxbytes = sum([a.dtype.itemsize * a.data.cdim
                        for a in self._unwound_args if a._is_indirect])
        # shared memory as reported by the device, divided by some
        # factor. This is the same calculation as done inside
        # op_plan_core, but without assuming 48K shared memory.
        # It would be much nicer if we could tell op_plan_core "I
        # have X bytes shared memory"
        part_size = (_AVAILABLE_SHARED_MEMORY / (64 * maxbytes)) * 64
        _plan = Plan(part, *self._unwound_args, partition_size=part_size)
        max_grid_size = _plan.ncolblk.max()

    # Marshal device pointers.  NOTE(review): append order must match
    # the generated kernel's parameter order — do not reorder.
    for arg in _args:
        if arg._is_mat:
            # Matrices are passed as an LMA buffer pointer + offset.
            d = arg.data._lmadata.gpudata
            offset = arg.data._lmaoffset(self._it_space.iterset)
            arglist.append(np.intp(d))
            arglist.append(np.int32(offset))
        else:
            arg.data._allocate_device()
            # WRITE-only data need not be copied to the device first.
            if arg.access is not op2.WRITE:
                arg.data._to_device()
            karg = arg.data._device_data
            if arg._is_global_reduction:
                arg.data._allocate_reduction_buffer(max_grid_size,
                                                    arg.access)
                karg = arg.data._reduction_buffer
            arglist.append(np.intp(karg.gpudata))

    if self._is_direct:
        _stream.synchronize()
        fun(max_grid_size, block_size, _stream, *arglist,
            shared_size=shared_size)
    else:
        arglist.append(_plan.ind_map.gpudata)
        arglist.append(_plan.loc_map.gpudata)
        arglist.append(_plan.ind_sizes.gpudata)
        arglist.append(_plan.ind_offs.gpudata)
        arglist.append(None)  # Block offset (slot -7, patched per colour)
        arglist.append(_plan.blkmap.gpudata)
        arglist.append(_plan.offset.gpudata)
        arglist.append(_plan.nelems.gpudata)
        arglist.append(_plan.nthrcol.gpudata)
        arglist.append(_plan.thrcol.gpudata)
        arglist.append(None)  # Number of colours in this block (slot -1)
        block_offset = 0
        for col in xrange(_plan.ncolors):
            blocks = _plan.ncolblk[col]
            if blocks > 0:
                arglist[-1] = np.int32(blocks)
                arglist[-7] = np.int32(block_offset)
                blocks = np.asscalar(blocks)
                # Compute capability < 3 can handle at most 2**16 - 1
                # blocks in any one dimension of the grid.
                if blocks >= 2 ** 16:
                    grid_size = (2 ** 16 - 1,
                                 (blocks - 1) / (2 ** 16 - 1) + 1, 1)
                else:
                    grid_size = (blocks, 1, 1)
                block_size = (128, 1, 1)
                shared_size = np.asscalar(_plan.nsharedCol[col])
                # Global reductions require shared memory of at least block
                # size * sizeof(double) for the reduction buffer
                if any(arg._is_global_reduction for arg in self.args):
                    shared_size = max(128 * 8, shared_size)
                _stream.synchronize()
                fun(grid_size, block_size, _stream, *arglist,
                    shared_size=shared_size)
            block_offset += blocks

    # Wait for all launches before touching results on the host.
    _stream.synchronize()
    for arg in self.args:
        if arg._is_global_reduction:
            arg.data._finalise_reduction_begin(max_grid_size, arg.access)
            arg.data._finalise_reduction_end(max_grid_size, arg.access)
        elif not arg._is_mat:
            # Set write state to False
            maybe_setflags(arg.data._data, write=False)
            # Data state is updated in finalise_reduction for Global
            if arg.access is not op2.READ:
                arg.data.state = DeviceDataMixin.DEVICE