def _optimize_loop_axis(dim):
    """
    Chooses kernel parameters including CUDA block size, grid size, and
    number of elements to compute per thread for the loop axis. The loop
    axis is the axis of the tensor for which a thread can compute multiple
    outputs. Uses a simple heuristic which tries to get at least 4 warps
    per block and 8 items per thread to hide latencies. Prefers a higher
    items-per-thread count over launching many blocks for very large axes,
    since blocks are serialized by the GPU after all SMs are filled.

    Arguments:
        dim (int): Size of the tensor on the loop axis.

    Returns:
        tuple of grid dimension, block dimension, and items per thread
    """
    sm_count = _get_sm_count()

    # -(-a // b) is ceiling division
    griddim = min(sm_count, -((-dim) // 32))
    items_per_block = -((-dim) // griddim)

    items_per_thread = 1
    warps = -((-items_per_block) // (32 * items_per_thread))

    # Raise items_per_thread (up to 8) until the warp count per block drops
    # to 4 or fewer; always keep going while it exceeds the 32-warp limit.
    while (warps > 4 and items_per_thread < 8) or (warps > 32):
        items_per_thread = items_per_thread + 1
        warps = -((-items_per_block) // (32 * items_per_thread))

    blockdim = warps * 32

    return (griddim, blockdim, items_per_thread)
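# A minimal worked example of the heuristic above, assuming a hypothetical GPU
# with 24 SMs (the real value comes from _get_sm_count()). For dim = 100000:
#   griddim = min(24, ceil(100000 / 32)) = 24
#   items_per_block = ceil(100000 / 24) = 4167
#   the loop raises items_per_thread to 8 (warps = ceil(4167 / 256) = 17)
#   blockdim = 17 * 32 = 544
# so the launch would use 24 blocks of 544 threads, each thread producing 8
# outputs. The helper name below is illustrative only and is not part of the
# original module; calling it requires a working CUDA device.
def _example_optimize_loop_axis():
    """Sketch only: illustrates the shape of the returned tuple."""
    griddim, blockdim, items_per_thread = _optimize_loop_axis(100000)
    # The heuristic always covers the whole axis.
    assert griddim * blockdim * items_per_thread >= 100000
    return griddim, blockdim, items_per_thread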
def __init__(self, device_id=None, enable_winograd=True, deterministic=True,
             scratch_size=0):
    drv.init()
    self.device_id = device_id if device_id is not None else 0

    # check compute capability
    self.compute_capability = drv.Device(self.device_id).compute_capability()
    if self.compute_capability[0] < 3:
        raise RuntimeError("Unsupported GPU")

    # context
    self.ctx = drv.Device(self.device_id).make_context()

    # attributes
    self.stream = None
    self.warmup = False
    self.scratch_size = scratch_size
    self.scratch_offset = 0
    self.sm_count = _get_sm_count()

    # store GPU memory size in bytes
    self.gpu_memory_size = drv.mem_get_info()[1]

    # Fall back to CUDA C kernels on older (pre-Maxwell) GPU generations
    if self.compute_capability[0] < 5:
        # TODO: this is not fully supported in graph yet
        self.use_cudac_kernels = True
    else:
        self.use_cudac_kernels = False

    # TODO
    # self.cublas_handle = cublas.cublasCreate()
    self.pcg = rng_mrg()

    self.enable_winograd = enable_winograd
    self.deterministic = deterministic
    self.cache_dir = get_cache_dir()
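# A minimal standalone sketch of the device checks performed in __init__ above,
# assuming PyCUDA is available (it is imported as `drv` in this module). The
# helper name is hypothetical and only probes the device; it does not reproduce
# the full initialization.
def _probe_device(device_id=0):
    """Return (compute_capability, total_memory_bytes) for a CUDA device."""
    import pycuda.driver as drv
    drv.init()
    dev = drv.Device(device_id)
    capability = dev.compute_capability()      # e.g. (6, 1), a (major, minor) tuple
    if capability[0] < 3:
        raise RuntimeError("Unsupported GPU")  # same policy as __init__
    ctx = dev.make_context()                   # create and push a context
    try:
        total_bytes = drv.mem_get_info()[1]    # mem_get_info() -> (free, total)
    finally:
        ctx.pop()                              # make it non-current again
        ctx.detach()                           # and release it
    return capability, total_bytes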
def sm_count(self):
    return _get_sm_count()
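# _get_sm_count() is defined elsewhere in this module. The sketch below shows
# one way to obtain the streaming-multiprocessor count with PyCUDA; it is an
# illustration under that assumption, not necessarily how _get_sm_count() is
# actually written.
def _example_sm_count(device_id=0):
    """Query the number of SMs on a device via a CUDA device attribute."""
    import pycuda.driver as drv
    drv.init()
    return drv.Device(device_id).get_attribute(
        drv.device_attribute.MULTIPROCESSOR_COUNT)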
def _build_maxas_kernel(self, op, size=None):
    """
    Uses tensor dimensions and axis ordering to select a sass kernel and
    uses maxas to compile it for later use.

    Arguments:
        op (DotOp): Graph op being transformed into this kernel
        size (str): Optional preselected tile size
    """
    # Get inputs to gemm
    C = TensorDescriptionWrapper(op.tensor_description(), 2)
    A, B = (TensorDescriptionWrapper(_, 2) for _ in op.call_info())

    # If both inputs are 1d, need to transpose one of them
    if min(A.strides) == 0 and min(B.strides) == 0:
        A.strides = tuple(reversed(A.strides))
        A.shape = tuple(reversed(A.shape))
        vector_dot = True
    else:
        vector_dot = False

    self.C = C
    self.A = A
    self.B = B

    # Kernels only support 2d tensors
    assert len(A.shape) == 2
    assert len(B.shape) == 2
    assert len(C.shape) == 2

    # one dimension must be contiguous
    assert min(A.strides) == 1 or max(A.strides) == 1
    assert min(B.strides) == 1 or max(B.strides) == 1
    assert min(C.strides) == 1 or max(C.strides) == 1 or vector_dot

    lda = max(A.strides)
    ldb = max(B.strides)
    ldc = max(C.strides)

    if A.is_trans:
        opA = 't'
        if size not in ("32x64", "16x64"):
            lda *= 8 * A.dtype.itemsize  # saves a kernel register
    else:
        opA = 'n'

    if B.is_trans:
        opB = 't'
    else:
        opB = 'n'
        if size not in ("32x64", "16x64"):
            ldb *= 8 * B.dtype.itemsize  # saves a kernel register

    op = opA + opB
    assert op != "tt"

    m = A.shape[0]
    n = B.shape[1]
    k = A.shape[1]

    assert m == C.shape[0]
    assert n == C.shape[1]
    assert k == B.shape[0]

    # Flex only has the 128x128 tile size
    if C.is_flex():
        size = "128x128"

    # Some basic tile size selection.
    # Your best bet is to benchmark your code with all 3 sizes
    # and manually fine tune the selection for each layer.
    # TODO: Perhaps I'll add an autotuning mode.
    if size is None:
        # find the shorter side
        short = min(m, n)
        # anything bigger than this just use 128
        if short < 384 - 16:
            # compute remainder of 128
            short128 = short % 128
            # if remainder is more than 112 just use 128
            if 0 < short128 < 112:
                # to figure out when to use 64 over 32 we need to calc
                # occupancy at 64
                if 48 < short128 <= 64:
                    occupancy64 = short // 64
                    wide = max(m, n)
                    occupancy64 *= (wide // 128 + (wide % 128 != 0)) // _get_sm_count()

                    # 64 is only faster than 32 when occupancy is more than
                    # 1 warp per scheduler.
                    if occupancy64 > 1:
                        size = 64
                    else:
                        size = 32
                else:
                    size = 32
            else:
                size = 128
        # There's a large regime where 64 is faster, but it's hard to
        # characterize
        else:
            size = 128

        # match the kernel to the optimal short size but avoid not
        # implemented kernels
        if m >= n:
            if op == "nt":
                size = 128
            sizeA, sizeB = (128, size)
        else:
            if op == "tn":
                size = 128
            # temp till I can write these kernels (coming soon)
            elif size == 64:
                size = 32
            sizeA, sizeB = (size, 128)

        size = "%dx%d" % (sizeA, sizeB)

    else:
        sizeA, sizeB = (int(s) for s in size.split('x'))

    gridA = m // sizeA + (m % sizeA != 0)
    gridB = n // sizeB + (n % sizeB != 0)

    k_vec = 8 if sizeA in (16, 32) or sizeB == 32 else 16

    vec_opt = None
    if op == "tn":
        if (m % 4 == 0 and n % 4 == 0 and
                A.strides[1] % 4 == 0 and B.strides[0] % 4 == 0):
            vec_opt = ("vec",)
    elif op == "nn":
        if (k % k_vec == 0 and n % 4 == 0 and
                A.strides[0] % k_vec == 0 and B.strides[0] % 4 == 0):
            vec_opt = ("vec",)
    elif op == "nt":
        if (k % k_vec == 0 and n % 4 == 0 and
                A.strides[0] % k_vec == 0 and B.strides[1] % k_vec == 0):
            vec_opt = ("vec",)

    # nt and nn are more efficient with k%16==0
    if C.is_flex():
        clss = "fgemm"
    elif C.dtype.type is np.float16:
        clss = "hgemm"
    elif C.dtype.type is np.float32:
        clss = "sgemm"
    else:
        raise TypeError("Only floating point dot currently supported.")

    # TODO: Flex may not have all "size" options (Urs)
    self.kernel = kernel_specs.get_kernel("_".join((clss, op, size)), vec_opt)

    # alpha, beta
    self.alpha = 1.0
    self.beta = 0.0

    # create params
    # if params list changes, indices in bind_flex_scales may need updating
    self.params = [
        (1, int(gridA), int(gridB)), (self.kernel.threads, 1, 1), None,
        C.td, A.td, B.td, self.alpha, self.beta, 0,
        int(lda), int(ldb), int(ldc),
        int(m), int(n), int(k),
        0, 0, 0, 0
    ]

    if clss == "fgemm":
        # save flex entries for bind_flex_scales
        self.flex_entry_A = A.flex_entry()
        self.flex_entry_B = B.flex_entry()
        self.flex_entry_C = C.flex_entry()

        # flex params
        self.params += [FlexPtrDescription(self.flex_entry_C), 1.0]  # maxabs ptr, output scale

        # record output flex id for autoflex
        self.output_flex_ids = [self.flex_entry_C.flex_id]
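# A worked example of the kernel naming and launch-grid logic above, under
# assumed shapes that are not taken from this file: a float32 dot with
# non-transposed operands A (1000x256) and B (256x512) gives clss="sgemm",
# op="nn" and, because min(m, n) = 512 is not below the 384 - 16 threshold,
# the 128x128 tile, i.e. the kernel "sgemm_nn_128x128" launched with
# gridA = ceil(1000 / 128) = 8 and gridB = ceil(512 / 128) = 4. The helper
# below is illustrative only and mirrors the ceiling division used above.
def _example_gemm_grid(m, n, sizeA=128, sizeB=128):
    """Ceiling-divide the output dimensions by the tile size."""
    gridA = m // sizeA + (m % sizeA != 0)
    gridB = n // sizeB + (n % sizeB != 0)
    return gridA, gridB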