def __init__(self, device):
    """Record the launch limits and memory characteristics of ``device``.

    :param device: device object; this constructor reads its
        ``max_threads_per_block``, ``max_block_dim_{x,y,z}``,
        ``max_grid_dim_{x,y,z}``, ``warp_size``,
        ``max_shared_memory_per_block`` and ``multiprocessor_count``
        attributes and calls its ``compute_capability()`` method.
    """
    self.api_id = get_id()
    self._device = device

    self.max_work_group_size = device.max_threads_per_block
    self.max_work_item_sizes = [
        device.max_block_dim_x,
        device.max_block_dim_y,
        device.max_block_dim_z,
    ]
    self.max_num_groups = [
        device.max_grid_dim_x,
        device.max_grid_dim_y,
        device.max_grid_dim_z,
    ]

    # there is no corresponding constant in the API at the moment,
    # so the bank count is chosen from the first compute capability component
    if device.compute_capability()[0] < 2:
        self.local_mem_banks = 16
    else:
        self.local_mem_banks = 32

    self.warp_size = device.warp_size

    # Maps each word size (4, 8, 16) to what DeviceData.align_words()
    # reports for it.
    # NOTE(review): sizes are presumably in bytes -- confirm.
    alignment_info = DeviceData(device)
    self.min_mem_coalesce_width = {
        word_size: alignment_info.align_words(word_size=word_size)
        for word_size in [4, 8, 16]
    }

    self.local_mem_size = device.max_shared_memory_per_block
    self.compute_units = device.multiprocessor_count
def __init__(self, device):
    """Populate the device parameters from ``device``.

    Copies the per-axis block and grid limits, warp size, shared memory
    size and multiprocessor count from ``device``, and derives the local
    memory bank count and the coalescing widths.

    :param device: device object whose attributes and
        ``compute_capability()`` method are queried.
    """
    self.api_id = get_id()
    self._device = device
    self.max_work_group_size = device.max_threads_per_block

    # Per-axis limits, always in x, y, z order.
    axes = ("x", "y", "z")
    self.max_work_item_sizes = [
        getattr(device, "max_block_dim_" + axis) for axis in axes
    ]
    self.max_num_groups = [
        getattr(device, "max_grid_dim_" + axis) for axis in axes
    ]

    # there is no corresponding constant in the API at the moment
    leading_capability = device.compute_capability()[0]
    self.local_mem_banks = 16 if leading_capability < 2 else 32

    self.warp_size = device.warp_size

    # Word size -> result of DeviceData.align_words() for that size.
    device_data = DeviceData(device)
    coalesce_widths = {}
    for word_size in [4, 8, 16]:
        coalesce_widths[word_size] = device_data.align_words(
            word_size=word_size)
    self.min_mem_coalesce_width = coalesce_widths

    self.local_mem_size = device.max_shared_memory_per_block
    self.compute_units = device.multiprocessor_count
def __init__(self, device, stream, mempool):
    """Store the stream, query device limits and pick an allocator.

    :param device: device object queried through ``get_attribute()`` and
        wrapped in ``DeviceData``.
    :param stream: stream to keep; when it is ``None`` the
        ``_recreate_stream`` flag is set.
    :param mempool: optional memory pool; when given, allocations go
        through ``mempool.allocate``, otherwise through ``cuda.mem_alloc``.
    """
    self._stream = stream
    self._recreate_stream = stream is None

    # Word size -> result of DeviceData.align_words() for that size.
    device_data = DeviceData(device)
    self.min_mem_coalesce_width = {
        word_size: device_data.align_words(word_size=word_size)
        for word_size in [4, 8, 16]
    }
    self.num_smem_banks = device_data.smem_granularity

    self.max_registers = device.get_attribute(
        device_attribute.MAX_REGISTERS_PER_BLOCK)

    # NOTE(review): 2**log2(x) presumably rounds the grid limit down to a
    # power of two, assuming log2 is an integer (floor) logarithm -- confirm.
    grid_limit_x = device.get_attribute(device_attribute.MAX_GRID_DIM_X)
    self.max_grid_x = 2**log2(grid_limit_x)
    grid_limit_y = device.get_attribute(device_attribute.MAX_GRID_DIM_Y)
    self.max_grid_y = 2**log2(grid_limit_y)

    # The block size limit is taken from the x dimension only.
    self.max_block_size = device.get_attribute(
        device_attribute.MAX_BLOCK_DIM_X)
    self.max_shared_mem = device.get_attribute(
        device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

    # `_mempool` is only set when a pool was actually supplied.
    if mempool is not None:
        self._mempool = mempool
        self.allocate = mempool.allocate
    else:
        self.allocate = cuda.mem_alloc
def __init__(self, device, stream, mempool):
    """Remember the stream, read device limits and select an allocator.

    :param device: device object; its limits are read with
        ``get_attribute()`` and it is passed to ``DeviceData``.
    :param stream: stream to store; ``_recreate_stream`` is ``True``
        exactly when this is ``None``.
    :param mempool: memory pool or ``None``. With a pool, ``allocate`` is
        ``mempool.allocate``; without one it is ``cuda.mem_alloc``.
    """
    self._stream = stream
    self._recreate_stream = stream is None

    # Word size -> result of DeviceData.align_words() for that size.
    devinfo = DeviceData(device)
    widths = {}
    for word_size in [4, 8, 16]:
        widths[word_size] = devinfo.align_words(word_size=word_size)
    self.min_mem_coalesce_width = widths
    self.num_smem_banks = devinfo.smem_granularity

    def query(attribute):
        # Thin wrapper so each limit below reads as a single lookup.
        return device.get_attribute(attribute)

    self.max_registers = query(device_attribute.MAX_REGISTERS_PER_BLOCK)

    # NOTE(review): presumably clamps each grid limit to a power of two,
    # assuming log2 returns an integer (floor) logarithm -- confirm.
    self.max_grid_x = 2 ** log2(query(device_attribute.MAX_GRID_DIM_X))
    self.max_grid_y = 2 ** log2(query(device_attribute.MAX_GRID_DIM_Y))

    # Only the x-dimension block limit is consulted here.
    self.max_block_size = query(device_attribute.MAX_BLOCK_DIM_X)
    self.max_shared_mem = query(device_attribute.MAX_SHARED_MEMORY_PER_BLOCK)

    # Without a pool, allocate directly and leave `_mempool` unset.
    if mempool is None:
        self.allocate = cuda.mem_alloc
        return

    self._mempool = mempool
    self.allocate = mempool.allocate