def elementwise_add(op: Sgemm) -> List[Kernel]:
    """Build the kernel list for an ``Sgemm`` operator.

    NOTE(review): the function is named ``elementwise_add`` although it is
    annotated for, and reads the attributes of, an ``Sgemm`` operator. The
    name is left untouched because callers may depend on it.

    Args:
        op: Operator exposing ``inputs["A"]``, ``inputs["B"]``,
            ``outputs["C"]`` and the attributes ``M``, ``N``, ``K``,
            ``transpose_A`` and ``transpose_B``.

    Returns:
        A single-element list containing the kernel generated for ``C``.

    Raises:
        AssertionError: If ``A`` and ``B`` report different channel modes.
    """
    lhs = op.inputs["A"]
    rhs = op.inputs["B"]
    out = op.outputs["C"]

    # The template is generated for the channel mode of ``A`` only, so both
    # operands have to agree on it.
    assert ChannelMode.get_mode(lhs) == ChannelMode.get_mode(rhs)

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "A": lhs,
        "B": rhs,
        "s_c": texture_stride(out),
        "d_C": [op.M, op.N],
        "s_C": [op.N, 1],
        "d_a": texture_shape(lhs),
        "s_a": texture_stride(lhs),
        "s_A": [op.K, 1] if op.transpose_A else [1, op.M],
        "d_b": texture_shape(rhs),
        "s_b": texture_stride(rhs),
        "s_B": [op.N, 1] if op.transpose_B else [1, op.K],
        "K": op.K
    })

    # Uniforms are injected first, the kernel name second.
    source = generate_template(mode=ChannelMode.get_mode(lhs), K=op.K)
    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, out)]
def convert_rgba_to_r(op: ConvertRGBAtoR) -> List[Kernel]:
    """Build the kernel list for a ``ConvertRGBAtoR`` operator.

    This variant performs no channel-mode or order checks on its operands.

    Args:
        op: Operator exposing ``inputs["x0"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def convert_rgba_to_r(op: ConvertRGBAtoR) -> List[Kernel]:
    """Build the kernel list for a ``ConvertRGBAtoR`` operator.

    Args:
        op: Operator exposing ``inputs["x0"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.

    Raises:
        AssertionError: If ``x0`` is not in RGBA channel mode or ``y`` is not
            in R channel mode.
        NotImplementedError: If ``x0`` and ``y`` have different orders.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    assert ChannelMode.get(src) == ChannelModeEnum.RGBA
    assert ChannelMode.get(dst) == ChannelModeEnum.R

    # Only the same-order case is handled.
    if src.order != dst.order:
        raise NotImplementedError

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def reinterpret_axis(op: ReinterpretAxis) -> List[Kernel]:
    """Build the kernel list for a ``ReinterpretAxis`` operator.

    Args:
        op: Operator exposing ``inputs["x"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X": src,
        "s_y": texture_stride(dst),
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def space2depth(op: Space2Depth) -> List[Kernel]:
    """Build the kernel list for a ``Space2Depth`` operator.

    Args:
        op: Operator exposing ``inputs["x"]``, ``outputs["y"]`` and
            ``parameters["r"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.

    Raises:
        AssertionError: If ``x`` or ``y`` is not in ``OrderNHWC``.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    block = op.parameters["r"]

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X": src,
        "s_y": texture_stride(dst),
        "d_Y": dst.shape,
        "s_Y": dst.stride,
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
        "d_X": src.shape,
        "s_X": src.stride,
        "r": block,
        "C1": src.shape_dict[Axis.C],
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def elementwise_add(op: ClippedRelu) -> List[Kernel]:
    """Build the kernel list for a ``ClippedRelu`` operator.

    NOTE(review): the function is named ``elementwise_add`` although it is
    annotated for a ``ClippedRelu`` operator and forwards its ``"cap"``
    parameter. The name is left untouched because callers may depend on it.

    Args:
        op: Operator exposing ``inputs["x0"]``, ``outputs["y"]`` and
            ``parameters["cap"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    # Shapes and strides are looked up per variable in the result of
    # ``optimize_loop_structure``.
    shapes, strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": shapes[dst],
        "s_Y": strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": shapes[src],
        "s_X0": strides[src],
        "cap": op.parameters["cap"]
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def average_pooling_2d(op: Unpooling2D) -> List[Kernel]:
    """Build the kernel list for an ``Unpooling2D`` operator.

    NOTE(review): the function is named ``average_pooling_2d`` although it is
    annotated for an ``Unpooling2D`` operator. The name is left untouched
    because callers may depend on it.

    Args:
        op: Operator exposing ``inputs["x"]``, ``outputs["y"]`` and the
            parameters ``"stride"``, ``"padding"`` (each indexed at 0 and 1)
            and ``"ksize"``.

    Returns:
        A single-element list containing the kernel generated for ``y``.

    Raises:
        AssertionError: If ``x`` or ``y`` is not in ``OrderNHWC``.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X": src,
        "s_y": texture_stride(dst),
        "d_Y": dst.shape,
        "s_Y": dst.stride,
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
        "s_X": src.stride,
        "C1": src.shape_dict[Axis.C],
        "H1": src.shape_dict[Axis.H],
        "W1": src.shape_dict[Axis.W],
        "SH": op.parameters["stride"][0],
        "SW": op.parameters["stride"][1],
        "PH": op.parameters["padding"][0],
        "PW": op.parameters["padding"][1],
    })

    # The template is generated from the kernel size; uniforms are injected
    # first, the kernel name second.
    source = generate_template(ksize=op.parameters["ksize"])
    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def elementwise_add(op: Tanh) -> List[Kernel]:
    """Build the kernel list for a ``Tanh`` operator.

    NOTE(review): the function is named ``elementwise_add`` although it is
    annotated for a ``Tanh`` operator. The name is left untouched because
    callers may depend on it.

    Args:
        op: Operator exposing ``inputs["x0"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    # Shapes and strides are looked up per variable in the result of
    # ``optimize_loop_structure``.
    shapes, strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": shapes[dst],
        "s_Y": strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": shapes[src],
        "s_X0": strides[src],
    })

    # The template is chosen by the channel mode of the output.
    if ChannelMode.get(dst) == ChannelModeEnum.R:
        source = template_R
    else:
        source = template_RGBA
    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def convert_r_to_rgba(op: ConvertRtoRGBA) -> List[Kernel]:
    """Build the kernel list for a ``ConvertRtoRGBA`` operator.

    Shapes and strides of ``x0`` are re-expressed along the (simplified)
    axes of ``y`` and both variables are padded with ones to four entries
    before being registered as uniforms.

    Args:
        op: Operator exposing ``inputs["x0"]`` and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.

    Raises:
        AssertionError: If ``x0`` is not in R channel mode or ``y`` is not in
            RGBA channel mode.
        NotImplementedError: If the simplified order of ``y`` has more than
            four dimensions.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    assert ChannelMode.get(src) == ChannelModeEnum.R
    assert ChannelMode.get(dst) == ChannelModeEnum.RGBA

    orders, shape_dicts = simplify_orders([src, dst])

    shapes = {}
    strides = {}
    stride_dicts = {}
    for var in (src, dst):
        var_axes = orders[var].axes
        shapes[var] = [shape_dicts[var][a] for a in var_axes]
        # Stride of an axis is the product of the sizes of all axes after it.
        strides[var] = [
            mul(shapes[var][orders[var].axes_dict[a] + 1:]) for a in var_axes
        ]
        stride_dicts[var] = AxisKeyDict(var_axes, strides[var])

    # Re-express the input along the output's axes; an axis the input lacks
    # gets size 1 and stride 1.
    shapes[src] = [
        shape_dicts[src][a] if a in orders[src].axes else 1
        for a in orders[dst].axes
    ]
    strides[src] = [
        stride_dicts[src][a] if a in orders[src].axes else 1
        for a in orders[dst].axes
    ]

    if orders[dst].ndim > 4:
        raise NotImplementedError(f"Too large number of dimension: {dst}")

    # Pad shapes and strides in place with ones up to four entries.
    for var in (src, dst):
        missing = 4 - len(shapes[var])
        if missing > 0:
            strides[var].extend([1] * missing)
            shapes[var].extend([1] * missing)

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "sampler_x": src,
        "texture_stride_y": texture_stride(dst),
        "variable_shape_y": shapes[dst],
        "variable_stride_y": strides[dst],
        "texture_shape_x": texture_shape(src),
        "texture_stride_x": texture_stride(src),
        "variable_shape_x": shapes[src],
        "variable_stride_x": strides[src],
    })

    # Uniforms are injected into the module-level template first, the
    # kernel name second.
    source = namer.inject(injector.inject(template))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)]
def partial_im2col(op: PartialIm2Col) -> List[Kernel]:
    """Build one kernel per output of a ``PartialIm2Col`` operator.

    Output ``col{i}`` receives an offset equal to ``([0] + op.sections)[i]``
    on ``op.axis`` and zero on every other axis.

    Args:
        op: Operator exposing ``inputs["im"]``, outputs ``"col0"``,
            ``"col1"``, ..., and the attributes ``sections``, ``axis``,
            ``KH``, ``KW``, ``DH``, ``DW``, ``SH``, ``SW``, ``PH``, ``PW``.

    Returns:
        The generated kernels, one per output, in output-index order.

    Raises:
        AssertionError: If ``im`` or any output is not in ``OrderNHWC``, or
            if ``im`` is not in R channel mode.
    """
    image = op.inputs["im"]
    outputs = [op.outputs[f"col{idx}"] for idx in range(len(op.outputs))]
    starts = [0] + op.sections
    split_axis_ = op.axis

    result = []
    for idx, column in enumerate(outputs):
        assert image.order == column.order == OrderNHWC
        assert ChannelMode.get(image) == ChannelModeEnum.R

        namer = KernelNameInjector(op)
        injector = UniformInjector()

        offset = [starts[idx] if a == split_axis_ else 0 for a in column.order.axes]
        injector.register({
            "sampler_im": image,
            "texture_stride_col": texture_stride(column),
            "variable_shape_col": column.shape,
            "variable_stride_col": column.stride,
            "offset_col": offset,
            "texture_shape_im": texture_shape(image),
            "texture_stride_im": texture_stride(image),
            "variable_shape_im": image.shape,
            "variable_stride_im": image.stride,
            "C1": image.shape_dict[Axis.C],
            "H1": image.shape_dict[Axis.H],
            "W1": image.shape_dict[Axis.W],
            "KH": op.KH,
            "KW": op.KW,
            "DH": op.DH,
            "DW": op.DW,
            "SH": op.SH,
            "SW": op.SW,
            "PH": op.PH,
            "PW": op.PW,
        })

        # The template is chosen by the channel mode of this output.
        if ChannelMode.get(column) == ChannelModeEnum.R:
            source = template_R
        else:
            source = template_RGBA
        source = namer.inject(injector.inject(source))

        result.append(
            Kernel(source, namer.name, injector.samplers, injector.uniforms, column)
        )

    return result
def col2im(op: Col2Im) -> List[Kernel]:
    """Build the kernel list for a ``Col2Im`` operator.

    Args:
        op: Operator exposing ``inputs["col"]``, ``outputs["im"]`` and the
            attributes ``SH``, ``SW``, ``PH``, ``PW``; it is also handed to
            ``generate_template``.

    Returns:
        A single-element list containing the kernel generated for ``im``.

    Raises:
        AssertionError: If ``col`` or ``im`` is not in ``OrderNHWC`` or not
            in R channel mode.
    """
    column = op.inputs["col"]
    image = op.outputs["im"]

    assert column.order == OrderNHWC
    assert image.order == OrderNHWC
    assert ChannelMode.get(column) == ChannelModeEnum.R
    assert ChannelMode.get(image) == ChannelModeEnum.R

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "col": column,
        "s_im": texture_stride(image),
        "d_Im": image.shape,
        "s_Im": image.stride,
        "d_col": texture_shape(column),
        "s_col": texture_stride(column),
        "d_Col": column.shape,
        "s_Col": column.stride,
        "H2": column.shape_dict[Axis.H],
        "W2": column.shape_dict[Axis.W],
        "C1": image.shape_dict[Axis.C],
        "SH": op.SH,
        "SW": op.SW,
        "PH": op.PH,
        "PW": op.PW,
    })

    # Uniforms are injected first, the kernel name second.
    source = namer.inject(injector.inject(generate_template(op)))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, image)]
def elementwise_kernel(op: Elementwise):
    """Build the kernel list for a registered ``Elementwise`` operator.

    Uniforms are registered for the output, for every input (keyed by the
    input's name in ``op.inputs``) and for every parameter registered for
    ``op.__class__`` in ``_registered_items``.

    Args:
        op: Operator exposing ``inputs`` (a mapping) and ``outputs["y"]``.

    Returns:
        A single-element list containing the kernel generated for ``y``.
    """
    operands = list(op.inputs.values())
    out = op.outputs["y"]
    shapes, strides = _optimize_loop_structure([*operands, out], out)

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "texture_stride_y": texture_stride(out),
        "variable_shape_y": shapes[out],
        "variable_stride_y": strides[out]
    })

    for key, operand in op.inputs.items():
        injector.register({
            f"sampler_{key}": operand,
            f"texture_shape_{key}": texture_shape(operand),
            f"texture_stride_{key}": texture_stride(operand),
            f"variable_shape_{key}": shapes[operand],
            f"variable_stride_{key}": strides[operand],
        })

    # Each registered parameter is a function of the operator.
    for param_name, getter in _registered_items[op.__class__].parameters.items():
        injector.register({param_name: getter(op)})

    same_layout = all([
        operand.shape == out.shape
        and operand.order == out.order
        and texture_shape(operand) == texture_shape(out)
        for operand in operands
    ])
    if same_layout:
        # Every variable shares not only the element (logical) position but
        # also the pixel (actual) position, so the logical position does not
        # need to be computed.
        source = _generate_template_no_convert_position(op)
    else:
        # The logical position has to be computed.
        source = _generate_template_convert_position(op)

    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, out)]
def split_axis(op: SplitAxis) -> List[Kernel]:
    """Build one kernel per output of a ``SplitAxis`` operator.

    Output ``y{i}`` receives an offset equal to ``([0] + op.sections)[i]`` on
    ``op.axis`` and zero on every other axis. The shape and stride of ``x``
    are listed in the axis order of that output.

    Args:
        op: Operator exposing ``inputs["x"]``, outputs ``"y0"``, ``"y1"``,
            ..., and the attributes ``sections`` and ``axis``.

    Returns:
        The generated kernels, one per output, in output-index order.

    Raises:
        AssertionError: If an output does not have the same axes as ``x``, or
            if ``x`` or an output is not in R channel mode.
    """
    src = op.inputs["x"]
    outputs = [op.outputs[f"y{idx}"] for idx in range(len(op.outputs))]
    starts = [0] + op.sections
    cut_axis = op.axis

    result = []
    for idx, dst in enumerate(outputs):
        assert src.order.check_same_axes(dst.order)
        assert ChannelMode.get(src) == ChannelMode.get(dst) == ChannelModeEnum.R

        namer = KernelNameInjector(op)
        injector = UniformInjector()

        offset = [starts[idx] if a == cut_axis else 0 for a in dst.order.axes]
        injector.register({
            "sampler_x": src,
            "texture_stride_y": texture_stride(dst),
            "variable_shape_y": _pad_to_4d(dst.shape),
            "variable_stride_y": _pad_to_4d(dst.stride),
            "texture_shape_x": texture_shape(src),
            "texture_stride_x": texture_stride(src),
            "variable_shape_x": _pad_to_4d([src.shape_dict[a] for a in dst.order.axes]),
            "variable_stride_x": _pad_to_4d([src.stride_dict[a] for a in dst.order.axes]),
            "offset": _pad_to_4d(offset, 0)
        })

        # Uniforms are injected into the module-level template first, the
        # kernel name second.
        source = namer.inject(injector.inject(template))

        result.append(
            Kernel(source, namer.name, injector.samplers, injector.uniforms, dst)
        )

    return result
def tensordot(op: Tensordot) -> List[Kernel]:
    """Build the kernel list for a ``Tensordot`` operator.

    The operands are described to the template as an ``M x K`` and a
    ``K x N`` problem, where ``K`` is the product of the trailing
    ``len(op.axes[0])`` sizes of ``A``, ``M = A.size // K`` and
    ``N = B.size // K``.

    Args:
        op: Operator exposing ``inputs["A"]``, ``inputs["B"]``,
            ``outputs["C"]`` and ``axes`` (indexed at 0 for ``A`` and 1 for
            ``B``).

    Returns:
        A single-element list containing the kernel generated for ``C``.

    Raises:
        AssertionError: If the channel modes or the axis layout of the
            operands do not satisfy the checks below.
    """
    lhs = op.inputs["A"]
    rhs = op.inputs["B"]
    out = op.outputs["C"]
    axes = op.axes

    assert ChannelMode.get(lhs) == ChannelMode.get(rhs)
    assert ChannelMode.get(out) == ChannelModeEnum.R

    # The reduced axes have to be the trailing axes of each input.
    assert lhs.order.axes[-len(axes[0]):] == axes[0]
    assert rhs.order.axes[-len(axes[1]):] == axes[1]

    # The output's axes have to be [*remaining axes of A, *remaining axes of B].
    assert out.order.axes[:lhs.ndim - len(axes[0])] == lhs.order.axes[:-len(axes[0])]
    assert out.order.axes[-(rhs.ndim - len(axes[1])):] == rhs.order.axes[:-len(axes[1])]
    assert out.ndim == lhs.ndim - len(axes[0]) + rhs.ndim - len(axes[1])

    reduction = mul(lhs.shape[-len(axes[0]):])
    rows = lhs.size // reduction
    cols = rhs.size // reduction

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "A": lhs,
        "B": rhs,
        "s_c": texture_stride(out),
        "d_C": [rows, cols],
        "s_C": [cols, 1],
        "d_a": texture_shape(lhs),
        "d_b": texture_shape(rhs),
        "K": reduction
    })

    # Uniforms are injected first, the kernel name second.
    source = generate_template(mode=ChannelMode.get(lhs), reduction_size=reduction)
    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, out)]
def im2col(op: Im2Col) -> List[Kernel]:
    """Build the kernel list for an ``Im2Col`` operator.

    Args:
        op: Operator exposing ``inputs["im"]``, ``outputs["col"]`` and the
            attributes ``KH``, ``KW``, ``DH``, ``DW``, ``SH``, ``SW``,
            ``PH``, ``PW``.

    Returns:
        A single-element list containing the kernel generated for ``col``.

    Raises:
        AssertionError: If ``im`` or ``col`` is not in ``OrderNHWC``, or if
            ``im`` is not in R channel mode.
    """
    image = op.inputs["im"]
    column = op.outputs["col"]

    assert image.order == OrderNHWC
    assert column.order == OrderNHWC
    assert ChannelMode.get(image) == ChannelModeEnum.R

    namer = KernelNameInjector(op)
    injector = UniformInjector()
    injector.register({
        "im": image,
        "s_col": texture_stride(column),
        "d_Col": column.shape,
        "s_Col": column.stride,
        "d_im": texture_shape(image),
        "s_im": texture_stride(image),
        "d_Im": image.shape,
        "s_Im": image.stride,
        "C1": image.shape_dict[Axis.C],
        "H1": image.shape_dict[Axis.H],
        "W1": image.shape_dict[Axis.W],
        "KH": op.KH,
        "KW": op.KW,
        "DH": op.DH,
        "DW": op.DW,
        "SH": op.SH,
        "SW": op.SW,
        "PH": op.PH,
        "PW": op.PW,
    })

    # The template is chosen by the channel mode of the output.
    if ChannelMode.get(column) == ChannelModeEnum.R:
        source = template_R
    else:
        source = template_RGBA
    source = namer.inject(injector.inject(source))

    return [Kernel(source, namer.name, injector.samplers, injector.uniforms, column)]
def concat(op: Concat) -> List[Kernel]:
    """Build the kernels for a ``Concat`` operator.

    For every input ``x{i}`` two kernels are emitted, in this order:

    1. a kernel targeting ``y`` that samples ``x{i}`` and ``workspace`` and
       receives the start offset of ``x{i}`` along ``op.axis``;
    2. a kernel targeting ``workspace`` that samples ``y``.

    Args:
        op: Operator exposing inputs ``"x0"``, ``"x1"``, ... plus
            ``"workspace"``, ``outputs["y"]`` and the attribute ``axis``.

    Returns:
        The generated kernels, two per input, in input-index order.

    Raises:
        AssertionError: If an input does not have the same axes as ``y`` or
            has a different channel mode than ``y``.
    """
    # One entry of ``op.inputs`` is the workspace, the rest are x0, x1, ...
    xs = [op.inputs[f"x{i}"] for i in range(len(op.inputs) - 1)]
    workspace = op.inputs["workspace"]
    y = op.outputs["y"]
    axis = op.axis

    # sections[i] is the position along ``axis`` at which xs[i] starts inside
    # y, i.e. the total size of xs[0] .. xs[i - 1]. The sizes therefore have
    # to be accumulated over every input except the last one (accumulating
    # over xs[1:] would place xs[i] using the sizes of xs[1] .. xs[i]).
    sections = [0]
    for x in xs[:-1]:
        sections.append(sections[-1] + x.shape_dict[axis])

    kernels = []
    for i, x in enumerate(xs):
        assert x.order.check_same_axes(y.order)
        assert ChannelMode.get(x) == ChannelMode.get(y)

        offset = [sections[i] if a == axis else 0 for a in y.order.axes]

        name_injector = KernelNameInjector(op)
        uniform_injector = UniformInjector()
        uniform_injector.register({
            "sampler_x": x,
            "sampler_workspace": workspace,
            "texture_shape_workspace": texture_shape(workspace),
            "texture_stride_y": texture_stride(y),
            "variable_shape_y": _pad_to_4d(y.shape),
            "variable_stride_y": _pad_to_4d(y.stride),
            "texture_shape_x": texture_shape(x),
            "texture_stride_x": texture_stride(x),
            "variable_shape_x": _pad_to_4d([x.shape_dict[a] for a in y.order.axes]),
            "variable_stride_x": _pad_to_4d([x.stride_dict[a] for a in y.order.axes]),
            "offset": _pad_to_4d(offset, 0)
        })

        source = template
        source = uniform_injector.inject(source)
        source = name_injector.inject(source)
        kernels.append(Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            y
        ))

        # NOTE(review): the copy-back kernel is emitted once per input so
        # that ``workspace`` mirrors ``y`` before the next input is written;
        # confirm against ``template`` / ``template2``.
        name_injector2 = KernelNameInjector(op)
        uniform_injector2 = UniformInjector()
        uniform_injector2.register({
            "sampler_y": y,
            "texture_shape_y": texture_shape(y),
        })

        source2 = template2
        source2 = uniform_injector2.inject(source2)
        source2 = name_injector2.inject(source2)
        kernels.append(Kernel(
            source2,
            name_injector2.name,
            uniform_injector2.samplers,
            uniform_injector2.uniforms,
            workspace
        ))

    return kernels
def reduce_kernel(op: Reduce):
    """Build the kernel list for a registered ``Reduce`` operator.

    The input is described to the template with a "virtual" shape and stride:
    the entries for the output's (simplified) axes, padded with ones to three
    entries, followed by the size and stride of the reduced axis. The output
    shape and stride are padded with ones to four entries.

    Args:
        op: Operator exposing ``inputs["x"]``, ``outputs["y"]`` and the
            attribute ``axis`` (the axis being reduced).

    Returns:
        A single-element list containing the kernel generated for ``y``.

    Raises:
        NotImplementedError: If the simplified order of ``y`` has more than
            four dimensions.
    """
    x = op.inputs["x"]
    y = op.outputs["y"]
    axis = op.axis

    orders, shape_dicts = simplify_orders([x, y], keep_axes=[axis])

    # Shapes and strides are padded to 4D below, so larger ranks are rejected.
    if orders[y].ndim > 4:
        raise NotImplementedError(f"Too large number of dimension: {y}")

    shapes = {v: [shape_dicts[v][a] for a in orders[v].axes] for v in [x, y]}
    # Stride of an axis is the product of the sizes of all axes after it.
    strides = {
        v: [mul(shapes[v][orders[v].axes_dict[a] + 1:]) for a in orders[v].axes]
        for v in [x, y]
    }
    stride_dicts = {v: AxisKeyDict(orders[v].axes, strides[v]) for v in [x, y]}

    # Re-express x along y's axes; an axis x lacks gets size 1 and stride 1.
    x_virtual_shape = [
        shape_dicts[x][a] if a in orders[x].axes else 1 for a in orders[y].axes
    ]
    x_virtual_stride = [
        stride_dicts[x][a] if a in orders[x].axes else 1 for a in orders[y].axes
    ]

    # Pad with size-1 / stride-1 dummy axes. The shape entry must be 1 (as in
    # the padding of y below); appending the reduced axis' stride here would
    # introduce a bogus dimension whenever that stride is not 1.
    while len(x_virtual_shape) < 3:
        x_virtual_stride.append(1)
        x_virtual_shape.append(1)

    # The reduced axis goes last.
    x_virtual_shape.append(shape_dicts[x][axis])
    x_virtual_stride.append(stride_dicts[x][axis])

    y_virtual_shape = shapes[y]
    y_virtual_stride = strides[y]
    while len(y_virtual_shape) < 4:
        y_virtual_stride.append(1)
        y_virtual_shape.append(1)

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()
    uniform_injector.register({
        "texture_stride_y": texture_stride(y),
        "variable_shape_y": y_virtual_shape,
        "variable_stride_y": y_virtual_stride,
        "sampler_x": x,
        "texture_shape_x": texture_shape(x),
        "texture_stride_x": texture_stride(x),
        "variable_shape_x": x_virtual_shape,
        "variable_stride_x": x_virtual_stride,
    })

    # Each registered parameter is a function of the operator.
    for name, get_value in _registered_items[op.__class__].parameters.items():
        uniform_injector.register({name: get_value(op)})

    # Computing the logical position is required.
    source = _generate_template_convert_position(
        op, reduction_size=shape_dicts[x][axis])
    source = uniform_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel(
        source,
        name_injector.name,
        uniform_injector.samplers,
        uniform_injector.uniforms,
        y
    )
    return [kernel]