def call_compound_kernel(rand_state, compute_capability, *args):
    """
    Pass in a list of GPUTensor objects, constants and operators in postfix
    notation.

        C += 2.5 * A * B + 1
        call_compound_kernel(C, 2.5, A, "mul", B, "mul", 1, "add", C, "add", "assign")
    """
    out = None
    arg_cnt = 0
    op_cnt = 0
    array_ids = {}
    const_ids = {}
    kernel_args = [rand_state, ]
    type_args = []
    shape_stack = []
    threads = 32
    red_depth = 0

    # Apply reduction constraints and determine thread axis.
    # Blocks will be allocated counter to this axis.
    # Also detect if this is a broadcast or transpose op.
    contiguous = True
    reduction = False
    broadcast = False
    transpose = False
    argminmax = False
    takeop = False
    axis = 1
    for arg in args:
        if type(arg) is dict:
            op_name = arg["op"]
            if op_name in _reduction_ops:

                if op_name[0:3] == "arg":
                    argminmax = True

                # To reduce a whole tensor (axis=None), reduce along each axis
                # in succession.
                if arg.get("axis", None) not in (0, 1):
                    raise ValueError(
                        "Only reduction along an axis currently supported")

                # Keep axis values consistent within the same kernel
                if reduction is True:
                    if arg["axis"] != axis:
                        raise ValueError(
                            "Reduction only allowed along one axis per kernel.")
                else:
                    reduction = True
                    axis = arg["axis"]

            elif op_name == "onehot":
                takeop = True

        elif isinstance(arg, ng.GPUTensor):
            if len(arg.shape) < 2:
                broadcast = True
            elif (len(arg.shape) == 2 and
                    (arg.shape[0] == 1 or arg.shape[1] == 1)):
                broadcast = True
            elif arg.is_trans:
                transpose = True
            elif arg.take_array:
                takeop = True
            elif not arg.is_contiguous:
                contiguous = False

    # If reducing along axis 0 we need to reverse all strides.
    # Each block gets a column and the threads work down the columns.
    strides_order = 1 if axis == 1 else -1

    for arg in args:

        # Array operand
        if isinstance(arg, ng.GPUTensor):

            # for complex operations, use the native dimensions
            if broadcast or reduction or transpose or takeop or not contiguous:
                if len(arg.shape) == 2:
                    shape = arg.shape
                    strides = list(arg.strides[::strides_order])
                else:
                    raise ValueError(
                        "Operations that are not simple elementwise are only "
                        "currently supported in 2 dimensions.")
            # use more efficient 2d dimensions if this is a plain ew op.
            else:
                shape, strides = _get_fast_ew_dims(arg.size)
                strides = list(strides[::strides_order])

            # If the same array is passed in multiple times to the expression,
            # consolidate it into one kernel argument.
            if arg in array_ids:
                indx = array_ids[arg]
            else:
                # The first array passed in should be the output.
                # It's ok if this array is duplicated, as the first instance
                # needs to be a mutable pointer.
                # A subsequent instance of out (if present) will be a const
                # pointer.
                if out is None:
                    out = arg
                    indx = arg_cnt
                else:
                    indx = array_ids[arg] = arg_cnt
                arg_cnt += 1

                # support broadcast
                # Need to use the shape of the base array to determine the
                # stride if this operation is a take
                if arg.take_array:
                    if arg.base.shape[0] == 1:
                        strides[1 - axis] = 0
                    if arg.base.shape[1] == 1:
                        strides[axis] = 0
                else:
                    if shape[0] == 1:
                        strides[1 - axis] = 0
                    if shape[1] == 1:
                        strides[axis] = 0

                kernel_args.extend((arg.gpudata, strides[0], strides[1]))

                # fancy indexing/take
                if arg.take_array:
                    kernel_args.append(arg.take_array[0].gpudata)

            # swap the take axis when reducing axis=0
            # also add 1 to distinguish from the no-take case
            if arg.take_array:
                if axis != 1:
                    take_axis = 2 - arg.take_array[1]
                else:
                    take_axis = arg.take_array[1] + 1
            # no take operation
            else:
                take_axis = 0

            type_args.append((ng.GPUTensor, indx, arg.dtype.str[1:], take_axis,
                              shape[axis] == 1))

            shape_stack.append(shape)

        # Constant operand
        elif type(arg) in (int, float):

            arg = float(arg)
            if arg in const_ids:
                indx = const_ids[arg]
            else:
                indx = const_ids[arg] = arg_cnt
                arg_cnt += 1

                kernel_args.append(arg)

            type_args.append((float, indx))
            shape_stack.append((1, 1))

        # Operation
        elif type(arg) is dict:

            op_name = arg["op"]

            if op_name in _float_ops:

                # we need to do the shape arithmetic for the current operation
                max_shape = [1, 1]
                for op_num in range(_float_ops[op_name][0]):
                    shape = shape_stack.pop()
                    for i in range(2):
                        if shape[i] != max_shape[i]:
                            # support broadcast
                            # TODO: don't allow the output tensor itself to be
                            # broadcastable.  The final output is fine as a
                            # broadcast, for example assigning a constant.
                            # You just don't want a tensor being assigned to a
                            # smaller shape.
                            if shape[i] == 1 or max_shape[i] == 1:
                                max_shape[i] = max(max_shape[i], shape[i])
                            else:
                                raise TypeError(
                                    "Input shape:%s not compatible" % (shape,))

                if op_name == "assign":

                    # the axis dim is the thread loop stop condition
                    kernel_args.append(max_shape[axis])

                    rounding = out.rounding

                    # support rounding to arbitrary mantissa size
                    if rounding:

                        # convert bool to some default mantissa
                        if rounding is True:
                            rounding = 10
                        elif out.dtype.type is np.float32:
                            rounding = min(rounding, 15)
                        elif out.dtype.type is np.float16:
                            rounding = min(rounding, 10)

                        kernel_args.append(max(rounding, 1))

                    # speed up deep reduction by using more than 32 threads
                    if not argminmax:
                        if reduction:
                            if red_depth >= 256:
                                threads = 64
                            # Try to bring this code back after figuring out
                            # race conditions:
                            # if red_depth >= 4096:
                            #     threads = 1024
                            # elif red_depth >= 2048:
                            #     threads = 512
                            # elif red_depth >= 1024:
                            #     threads = 256
                            # elif red_depth >= 512:
                            #     threads = 128
                            # elif red_depth >= 256:
                            #     threads = 64

                        # speed up deep broadcast by using more than 32 threads
                        elif not (reduction or transpose) and max_shape[1] >= 512:
                            threads = 256

                    type_args.append((op_name, op_cnt, rounding > 0, threads))

                elif op_name == "onehot":
                    # flip the one hot axis if reducing axis=0
                    hot_axis = arg["axis"] if axis else 1 - arg["axis"]
                    type_args.append((op_name, op_cnt, hot_axis))
                    shape_stack.append(max_shape)
                    kernel_args.append(arg["idx"].gpudata)

                else:
                    type_args.append((op_name, op_cnt))
                    shape_stack.append(max_shape)

            elif op_name in _reduction_ops:

                shape = list(shape_stack.pop())

                red_depth = max(red_depth, shape[axis])

                # Allow a new axis size if doing post-reduction broadcast,
                # so we need to know the axis size prior to reduction.
                kernel_args.append(shape[axis])
                type_args.append((op_name, op_cnt))

                # reduce the current shape
                shape[axis] = 1

                # update the current shape state
                shape_stack.append(shape)

            else:
                raise TypeError("%s is not a valid operation" % op_name)

            op_cnt += 1

        else:
            raise TypeError(
                "args must be an instance of GPUTensor, int, float, or dict "
                "(for operators)")

    # for s in argsprint:   print s
    # for s in kernel_args: print s
    # for s in type_args:   print s

    # get or create the kernel in the memoize cache
    kernel = _get_compound_kernel(tuple(type_args), compute_capability)

    shared = threads * 4 if reduction and threads > 32 else 0

    if out.backend.bench > 1:
        repeat = out.backend.bench
        start, end = ng._get_events()
        start.record(out.backend.stream)
    else:
        repeat = 1

    for r in range(repeat):

        # Call the kernel with the number of blocks set as the size of the
        # off-axis.  Maxwell does well with 32 thread sized blocks, no need
        # to autotune.
        # for a in kernel_args: print a
        kernel.prepared_async_call((max_shape[1 - axis], 1, 1),
                                   (threads, 1, 1),
                                   out.backend.stream,
                                   *kernel_args,
                                   shared_size=shared)

    if out.backend.bench > 1:
        end.record(out.backend.stream)
        end.synchronize()
        msecs = end.time_since(start) / repeat
        print("%7.3f msecs shape(%d,%d) blk,thd(%d,%d) %s" %
              (msecs, max_shape[0], max_shape[1],
               max_shape[1 - axis], threads, kernel.name))

    return out
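
# ----------------------------------------------------------------------------
# Usage sketch (an illustrative addition, not part of the original source).
# It shows how the docstring expression C += 2.5 * A * B + 1 is flattened
# into the postfix argument list that call_compound_kernel parses: the output
# tensor comes first, operands follow, and every operator is a dict keyed by
# "op" (reductions also carry an "axis" key).  The `gpu` backend object and
# the origin of `rand_state` / `compute_capability` are assumptions for
# illustration only; their exact construction is backend-specific.
# ----------------------------------------------------------------------------
def _example_compound_call(gpu, rand_state, compute_capability):
    # hypothetical backend allocation helper; any three like-shaped
    # GPUTensors would do
    A = gpu.empty((128, 128), dtype=np.float32)
    B = gpu.empty((128, 128), dtype=np.float32)
    C = gpu.empty((128, 128), dtype=np.float32)

    # output tensor first, then operands and operators in postfix order
    return call_compound_kernel(rand_state, compute_capability,
                                C,
                                2.5, A, {"op": "mul"},
                                B, {"op": "mul"},
                                1, {"op": "add"},
                                C, {"op": "add"},
                                {"op": "assign"})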