def depth2space(op: Depth2Space, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Depth2Space operator.

    Both the input ``x`` and the output ``y`` must be in ``OrderNHWC``.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``r``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``depth2space_x`` / ``depth2space_y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    block = op.parameters['r']

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    injector = BufferInjector()
    placeholders = {
        "depth2space_x": memory_layout[src],
        "depth2space_y": memory_layout[dst],
        'depth2space_r': block,
    }
    # Suffix 1 is taken from the input shape, suffix 2 from the output shape.
    src_dims = src.shape_dict
    dst_dims = dst.shape_dict
    placeholders["depth2space_N"] = src_dims[Axis.N]
    placeholders["depth2space_C1"] = src_dims[Axis.C]
    placeholders["depth2space_C2"] = dst_dims[Axis.C]
    placeholders["depth2space_H1"] = src_dims[Axis.H]
    placeholders["depth2space_H2"] = dst_dims[Axis.H]
    placeholders["depth2space_W1"] = src_dims[Axis.W]
    placeholders["depth2space_W2"] = dst_dims[Axis.W]
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def reinterpret_axis(op: ReinterpretAxis) -> List[Kernel]:
    """Generate the kernel for a ReinterpretAxis operator.

    The input variable is registered as ``X`` together with the texture
    shape/stride uniforms, and everything is injected into ``template``.

    Args:
        op: Operator providing input ``x`` and output ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X": src,
        "s_y": texture_stride(dst),
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
    }
    uniforms.register(bindings)

    shader = uniforms.inject(template)
    shader = namer.inject(shader)

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def softmax(op: Softmax, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Softmax operator.

    Input and output must share order and shape, and the ``axis`` parameter
    must be the last axis of the input's order.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``axis``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    y_alloc = memory_layout[op.outputs["y"]]

    assert y_alloc.variable.order == x_alloc.variable.order
    assert y_alloc.variable.shape == x_alloc.variable.shape

    axis = op.parameters["axis"]
    last_axis = x_alloc.variable.order.axes[-1]
    assert axis == last_axis, "[Webassembly] Softmax supports only for aggregating last axis."

    # softmax_C is the length along `axis`; softmax_N is the total size
    # divided by that length.
    axis_length = y_alloc.variable.shape_dict[axis]
    injector = BufferInjector()
    injector.register({
        "softmax_X": x_alloc,
        "softmax_Y": y_alloc,
        "softmax_N": y_alloc.variable.size // axis_length,
        "softmax_C": axis_length,
    })

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def arg_min_handler(op: ArgMin, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an ArgMin operator.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``axis``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``argmin_X`` / ``argmin_Y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    target_axis = op.parameters["axis"]

    injector = BufferInjector()
    placeholders = {
        "argmin_X": memory_layout[src],
        "argmin_Y": memory_layout[dst],
        "argmin_y_stride": dst.stride,
        "argmin_y_shape": dst.shape,
        # Input strides listed in the axis order of the output.
        "argmin_x_stride": [src.stride_dict[ax] for ax in dst.order.axes],
        "argmin_D": dst.ndim,
        "argmin_N": src.shape_dict[target_axis],
        "argmin_MAX_GID": dst.size,
        "argmin_x_target_axis_stride": src.stride_dict[target_axis],
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    return [Kernel(
        {namer.name: code},
        namer.name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Reshape operator.

    No kernel is emitted when the allocations of ``x`` and ``y`` have the
    same ``offset`` (in-place case).

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``in_order``, ``out_order`` and ``out_shape``.
        memory_layout: Indexed by variable to obtain allocations.

    Returns:
        An empty list for the in-place case, otherwise a single-element list
        containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    in_place = memory_layout[src].offset == memory_layout[dst].offset
    if in_place:
        return []

    params = op.parameters
    assert src.order == params["in_order"]
    assert dst.order == params["out_order"]
    assert dst.size == mul(params["out_shape"])

    injector = BufferInjector()
    injector.register({
        "reshape_x": memory_layout[src],
        "reshape_y": memory_layout[dst],
        "reshape_N": dst.size,
    })

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def convert_rgba_to_r(op: ConvertRGBAtoR) -> List[Kernel]:
    """Generate the kernel for a ConvertRGBAtoR operator.

    Args:
        op: Operator providing input ``x0`` and output ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X0": src,
        "s_y": texture_stride(dst),
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
    }
    uniforms.register(bindings)

    shader = namer.inject(uniforms.inject(template))

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def space2depth(op: Space2Depth, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Space2Depth operator.

    The variables behind both allocations must be in ``OrderNHWC``.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``r``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    y_alloc = memory_layout[op.outputs["y"]]
    block = op.parameters['r']

    assert x_alloc.variable.order == OrderNHWC
    assert y_alloc.variable.order == OrderNHWC

    injector = BufferInjector()
    # Suffix 1 is taken from the input shape, suffix 2 from the output shape.
    in_dims = x_alloc.variable.shape_dict
    out_dims = y_alloc.variable.shape_dict
    injector.register({
        "space2depth_x": x_alloc,
        "space2depth_y": y_alloc,
        'space2depth_r': block,
        "space2depth_N": in_dims[Axis.N],
        "space2depth_C1": in_dims[Axis.C],
        "space2depth_C2": out_dims[Axis.C],
        "space2depth_H1": in_dims[Axis.H],
        "space2depth_H2": out_dims[Axis.H],
        "space2depth_W1": in_dims[Axis.W],
        "space2depth_W2": out_dims[Axis.W],
    })

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def sum_handler(op: Sum, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Sum operator.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``axis``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``sum_X`` / ``sum_Y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    target_axis = op.parameters["axis"]

    injector = BufferInjector()
    placeholders = {
        "sum_X": memory_layout[src],
        "sum_Y": memory_layout[dst],
        "sum_y_stride": dst.stride,
        "sum_y_shape": dst.shape,
        # Input strides listed in the axis order of the output.
        "sum_x_stride": [src.stride_dict[ax] for ax in dst.order.axes],
        "sum_D": dst.ndim,
        "sum_N": src.shape_dict[target_axis],
        "sum_MAX_GID": dst.size,
        "sum_x_target_axis_stride": src.stride_dict[target_axis],
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    return [Kernel(
        {namer.name: code},
        namer.name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def space2depth(op: Space2Depth, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Space2Depth operator.

    Both the input ``x`` and the output ``y`` must be in ``OrderNHWC``.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``r``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``space2depth_x`` / ``space2depth_y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    block = op.parameters['r']

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    injector = BufferInjector()
    placeholders = {
        "space2depth_x": memory_layout[src],
        "space2depth_y": memory_layout[dst],
        'space2depth_r': block,
    }
    # Suffix 1 is taken from the input shape, suffix 2 from the output shape.
    src_dims = src.shape_dict
    dst_dims = dst.shape_dict
    placeholders["space2depth_N"] = src_dims[Axis.N]
    placeholders["space2depth_C1"] = src_dims[Axis.C]
    placeholders["space2depth_C2"] = dst_dims[Axis.C]
    placeholders["space2depth_H1"] = src_dims[Axis.H]
    placeholders["space2depth_H2"] = dst_dims[Axis.H]
    placeholders["space2depth_W1"] = src_dims[Axis.W]
    placeholders["space2depth_W2"] = dst_dims[Axis.W]
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def linear(op: Linear, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Linear operator.

    Accepted orders: ``x`` and ``y`` in ``OrderNC`` or ``OrderNHWC``, ``w`` in
    ``OrderCN`` or ``OrderHWCN``; ``w`` and ``x`` must have the same ``ndim``.

    Args:
        op: Operator providing inputs ``x`` and ``w`` and output ``y``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``linear_X`` / ``linear_Y`` / ``linear_W`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    data = op.inputs["x"]
    weight = op.inputs["w"]
    out = op.outputs["y"]

    assert data.order == OrderNC or data.order == OrderNHWC
    assert weight.order == OrderCN or weight.order == OrderHWCN
    assert out.order == OrderNC or out.order == OrderNHWC
    assert weight.ndim == data.ndim

    injector = BufferInjector()
    placeholders = {
        "linear_X": memory_layout[data],
        "linear_Y": memory_layout[out],
        "linear_W": memory_layout[weight],
        # M is the length of axis N of y; N and K are the remaining sizes of
        # y and x once axis N is divided out.
        "linear_M": out.shape_dict[Axis.N],
        "linear_N": out.size // out.shape_dict[Axis.N],
        "linear_K": data.size // data.shape_dict[Axis.N],
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    return [Kernel(
        {namer.name: code},
        namer.name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def softmax_same_order(op: Softmax, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the Softmax kernel from ``template_same_order``.

    The input shape is split around the target axis into three sizes:
    the product of the leading dimensions (D1), the target dimension (D2)
    and the product of the trailing dimensions (D3).

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``axis``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``softmax_X`` / ``softmax_Y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    axis = op.parameters["axis"]
    pos = src.order.axes_dict[axis]

    leading = mul(src.shape[:pos])
    middle = src.shape[pos]
    trailing = mul(src.shape[pos + 1:])

    injector = BufferInjector()
    injector.register({
        "softmax_X": memory_layout[src],
        "softmax_Y": memory_layout[dst],
        "softmax_D1": leading,
        "softmax_D2": middle,
        "softmax_D3": trailing,
    })

    namer = KernelNameInjector(op)
    code = injector.inject(template_same_order)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def average_pooling_2d(op: Unpooling2D) -> List[Kernel]:
    """Generate the kernel for the given pooling-style operator.

    Both the input ``x`` and the output ``y`` must be in ``OrderNHWC``. The
    shader source is produced by ``generate_template`` from the ``ksize``
    parameter.

    NOTE(review): the function name says "average pooling" while the
    annotation is ``Unpooling2D`` - confirm which one is intended.

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``stride``, ``padding`` and ``ksize``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X": src,
        "s_y": texture_stride(dst),
        "d_Y": dst.shape,
        "s_Y": dst.stride,
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
        "s_X": src.stride,
        "C1": src.shape_dict[Axis.C],
        "H1": src.shape_dict[Axis.H],
        "W1": src.shape_dict[Axis.W],
        # Index 0 / 1 of `stride` and `padding` feed the H / W uniforms.
        "SH": op.parameters["stride"][0],
        "SW": op.parameters["stride"][1],
        "PH": op.parameters["padding"][0],
        "PW": op.parameters["padding"][1],
    }
    uniforms.register(bindings)

    shader = generate_template(ksize=op.parameters["ksize"])
    shader = uniforms.inject(shader)
    shader = namer.inject(shader)

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def reinterpret_axis(op: ReinterpretAxis, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ReinterpretAxis operator.

    Only operations that need no transposition are currently supported.
    The variable orders must match the ``in_order`` / ``out_order``
    parameters.

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``in_order`` and ``out_order``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    y_alloc = memory_layout[op.outputs["y"]]

    assert x_alloc.variable.order == op.parameters["in_order"]
    assert y_alloc.variable.order == op.parameters["out_order"]

    injector = BufferInjector()
    placeholders = {
        "reinterpret_axis_x": x_alloc,
        "reinterpret_axis_y": y_alloc,
        "reinterpret_axis_N": y_alloc.variable.size,
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Reshape operator.

    The variable orders must match the ``in_order`` / ``out_order``
    parameters and the output size must equal the product of ``out_shape``.

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``in_order``, ``out_order`` and ``out_shape``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    y_alloc = memory_layout[op.outputs["y"]]

    assert x_alloc.variable.order == op.parameters["in_order"]
    assert y_alloc.variable.order == op.parameters["out_order"]
    assert y_alloc.variable.size == mul(op.parameters["out_shape"])

    injector = BufferInjector()
    placeholders = {
        "reshape_x": x_alloc,
        "reshape_y": y_alloc,
        "reshape_N": y_alloc.variable.size,
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(1024, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def elementwise_add(op: Tanh) -> List[Kernel]:
    """Generate the kernel for the given single-input operator.

    ``template_R`` is used when the channel mode of ``y`` is
    ``ChannelModeEnum.R``; otherwise ``template_RGBA`` is used.

    Args:
        op: Operator providing input ``x0`` and output ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    shapes, strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": shapes[dst],
        "s_Y": strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": shapes[src],
        "s_X0": strides[src],
    }
    uniforms.register(bindings)

    if ChannelMode.get(dst) == ChannelModeEnum.R:
        shader = template_R
    else:
        shader = template_RGBA
    shader = uniforms.inject(shader)
    shader = namer.inject(shader)

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def reshape(op: Reshape, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Reshape operator.

    Only operations that need no transposition are currently supported.
    No kernel is emitted when ``memory_layout[x] == memory_layout[y]``
    (in-place operation).

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``in_order``, ``out_order`` and ``out_shape``.
        memory_layout: Indexed by variable to obtain allocations.

    Returns:
        An empty list for the in-place case, otherwise a single-element list
        containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    in_place = memory_layout[src] == memory_layout[dst]
    if in_place:
        return []

    params = op.parameters
    assert src.order == params["in_order"]
    assert dst.order == params["out_order"]
    assert dst.size == mul(params["out_shape"])

    injector = BufferInjector()
    injector.register({
        "reshape_x": memory_layout[src],
        "reshape_y": memory_layout[dst],
        "reshape_N": dst.size,
    })

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def tile(op: Tile, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Tile operator.

    Args:
        op: Operator providing input ``x`` and output ``y``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``tile_x`` / ``tile_y`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    injector = BufferInjector()
    placeholders = {
        "tile_x": memory_layout[src],
        "tile_y": memory_layout[dst],
        "tile_y_stride": dst.stride,
        # Input strides and shape listed in the axis order of the output.
        "tile_x_stride": [src.stride_dict[ax] for ax in dst.order.axes],
        "tile_x_shape": [src.shape_dict[ax] for ax in dst.order.axes],
        "tile_D": src.ndim,
        "tile_MAX_GID": dst.size,
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def elementwise_add(op: ClippedRelu) -> List[Kernel]:
    """Generate the kernel for the given single-input operator with a cap.

    In addition to the shape/stride uniforms, the operator parameter
    ``cap`` is registered as the ``cap`` uniform.

    Args:
        op: Operator providing input ``x0``, output ``y`` and parameter
            ``cap``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    shapes, strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": shapes[dst],
        "s_Y": strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": shapes[src],
        "s_X0": strides[src],
        "cap": op.parameters["cap"],
    }
    uniforms.register(bindings)

    shader = namer.inject(uniforms.inject(template))

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def embedding(op: Embedding, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an Embedding operator.

    Required orders: ``x`` in ``OrderNT``, ``w`` in ``OrderCN`` and ``y`` in
    ``OrderNTC``.

    Args:
        op: Operator providing inputs ``x`` and ``w`` and output ``y``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``embedding_X`` / ``embedding_Y`` / ``embedding_W`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    indices = op.inputs["x"]
    weight = op.inputs["w"]
    out = op.outputs["y"]

    assert indices.order == OrderNT
    assert weight.order == OrderCN
    assert out.order == OrderNTC

    injector = BufferInjector()
    placeholders = {
        "embedding_X": memory_layout[indices],
        "embedding_Y": memory_layout[out],
        "embedding_W": memory_layout[weight],
        "embedding_T": indices.shape_dict[Axis.T],
        "embedding_N": indices.shape_dict[Axis.N],
        # Note: embedding_C is read from axis N of the weight.
        "embedding_C": weight.shape_dict[Axis.N],
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def space2depth(op: Space2Depth) -> List[Kernel]:
    """Generate the kernel for a Space2Depth operator.

    Both the input ``x`` and the output ``y`` must be in ``OrderNHWC``.

    Args:
        op: Operator providing input ``x``, output ``y`` and parameter ``r``.

    Returns:
        A single-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    block = op.parameters['r']

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X": src,
        "s_y": texture_stride(dst),
        "d_Y": dst.shape,
        "s_Y": dst.stride,
        "d_x": texture_shape(src),
        "s_x": texture_stride(src),
        "d_X": src.shape,
        "s_X": src.stride,
        "r": block,
        "C1": src.shape_dict[Axis.C],
    }
    uniforms.register(bindings)

    shader = uniforms.inject(template)
    shader = namer.inject(shader)

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def convert_rgba_to_r(op: ConvertRGBAtoR) -> List[Kernel]:
    """Generate the kernel for a ConvertRGBAtoR operator.

    The input must be in channel mode ``RGBA`` and the output in channel
    mode ``R``.

    Args:
        op: Operator providing input ``x0`` and output ``y``.

    Returns:
        A single-element list containing the generated kernel.

    Raises:
        NotImplementedError: If the orders of ``x0`` and ``y`` differ.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    assert ChannelMode.get(src) == ChannelModeEnum.RGBA
    assert ChannelMode.get(dst) == ChannelModeEnum.R

    if src.order != dst.order:
        raise NotImplementedError

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "X0": src,
        "s_y": texture_stride(dst),
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
    }
    uniforms.register(bindings)

    shader = namer.inject(uniforms.inject(template))

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
def sgemm(op: Sgemm, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an Sgemm operator.

    The source comes from ``generate_template_64`` and is parameterised by
    the transpose flags and the ``M``, ``N``, ``K`` attributes of ``op``.

    Args:
        op: Operator providing inputs ``A`` and ``B``, output ``C`` and the
            attributes ``M``, ``N``, ``K``, ``transpose_A``, ``transpose_B``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``A``, ``B`` and ``C``.

    Returns:
        A single-element list containing the generated kernel.
    """
    a_alloc = memory_layout[op.inputs["A"]]
    b_alloc = memory_layout[op.inputs["B"]]
    c_alloc = memory_layout[op.outputs["C"]]

    injector = BufferInjector()
    placeholders = {
        "sgemm_A": a_alloc,
        "sgemm_B": b_alloc,
        "sgemm_C": c_alloc,
        "sgemm_M": op.M,
        "sgemm_N": op.N,
        "sgemm_K": op.K,
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)

    # transpose_X assumes fortran-order data. True means X is C-order, False means Fortran-order.
    # In default convolution, transpose_A == transpose_B == True.
    # The order of output matrix C is C-order.
    code = generate_template_64(op.transpose_A, op.transpose_B, op.M, op.N, op.K)
    code = injector.inject(code)
    code = namer.inject(code)

    # The first two components of the first GPUSize are M and N divided by
    # 64, rounded up.
    block = 64
    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize((op.M + block - 1) // block, (op.N + block - 1) // block, 1),
        GPUSize(64, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def axiswise_scale_same_order(op: AxiswiseScale, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the AxiswiseScale kernel from ``template_same_order``.

    The input shape is split around ``op.axis`` into three sizes: the
    product of the leading dimensions (D1), the target dimension (D2) and
    the product of the trailing dimensions (D3).

    Args:
        op: Operator providing inputs ``x`` and ``s``, output ``y`` and the
            attribute ``axis``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x``, ``s`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    s_alloc = memory_layout[op.inputs["s"]]
    y_alloc = memory_layout[op.outputs["y"]]

    pos = x_alloc.variable.order.axes_dict[op.axis]
    leading = mul(x_alloc.variable.shape[:pos])
    middle = x_alloc.variable.shape[pos]
    trailing = mul(x_alloc.variable.shape[pos + 1:])

    injector = BufferInjector()
    injector.register({
        "axiswise_scale_X": x_alloc,
        "axiswise_scale_S": s_alloc,
        "axiswise_scale_Y": y_alloc,
        "axiswise_scale_D1": leading,
        "axiswise_scale_D2": middle,
        "axiswise_scale_D3": trailing,
    })

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template_same_order))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def embedding(op: Embedding, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an Embedding operator.

    Required orders: ``x`` in ``OrderNT``, ``w`` in ``OrderCN`` and ``y`` in
    ``OrderNTC``.

    Args:
        op: Operator providing inputs ``x`` and ``w`` and output ``y``.
        memory_layout: Indexed by variable to obtain the values bound to the
            ``embedding_X`` / ``embedding_Y`` / ``embedding_W`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    indices = op.inputs["x"]
    weight = op.inputs["w"]
    out = op.outputs["y"]

    assert indices.order == OrderNT
    assert weight.order == OrderCN
    assert out.order == OrderNTC

    injector = BufferInjector()
    placeholders = {
        "embedding_X": memory_layout[indices],
        "embedding_Y": memory_layout[out],
        "embedding_W": memory_layout[weight],
        # Vocabulary size comes from axis C of the weight and the embedding
        # dimension from its axis N.
        "embedding_vocabulary": weight.shape_dict[Axis.C],
        "embedding_sequence_len": indices.shape_dict[Axis.T],
        "embedding_batch_size": indices.shape_dict[Axis.N],
        "embedding_dim": weight.shape_dict[Axis.N],
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def elementwise_add(op: Sgemm) -> List[Kernel]:
    """Generate the kernel for the given Sgemm operator.

    ``A`` and ``B`` must have the same channel mode; that mode and ``op.K``
    select the shader via ``generate_template``.

    Args:
        op: Operator providing inputs ``A`` and ``B``, output ``C`` and the
            attributes ``M``, ``N``, ``K``, ``transpose_A``, ``transpose_B``.

    Returns:
        A single-element list containing the generated kernel.
    """
    lhs = op.inputs["A"]
    rhs = op.inputs["B"]
    out = op.outputs["C"]

    assert ChannelMode.get_mode(lhs) == ChannelMode.get_mode(rhs)

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()
    bindings = {
        "A": lhs,
        "B": rhs,
        "s_c": texture_stride(out),
        "d_C": [op.M, op.N],
        "s_C": [op.N, 1],
        "d_a": texture_shape(lhs),
        "s_a": texture_stride(lhs),
        # The stride pair depends on the corresponding transpose flag.
        "s_A": [op.K, 1] if op.transpose_A else [1, op.M],
        "d_b": texture_shape(rhs),
        "s_b": texture_stride(rhs),
        "s_B": [op.N, 1] if op.transpose_B else [1, op.K],
        "K": op.K,
    }
    uniforms.register(bindings)

    shader = generate_template(mode=ChannelMode.get_mode(lhs), K=op.K)
    shader = uniforms.inject(shader)
    shader = namer.inject(shader)

    return [Kernel(shader, namer.name, uniforms.samplers, uniforms.uniforms, out)]
def flatten(op: Flatten, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Flatten operator.

    No order check is performed between ``x`` and ``y``.

    Args:
        op: Operator providing input ``x`` and output ``y``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    y_alloc = memory_layout[op.outputs["y"]]

    injector = BufferInjector()
    placeholders = {
        "flatten_x": x_alloc,
        "flatten_y": y_alloc,
        "flatten_N": y_alloc.variable.size,
    }
    injector.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template))

    return [Kernel(
        {namer.name: code},
        namer.name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def reinterpret_axis(op: ReinterpretAxis, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ReinterpretAxis operator.

    No kernel is emitted when ``memory_layout[x] == memory_layout[y]``
    (in-place operation).

    Args:
        op: Operator providing input ``x``, output ``y`` and the parameters
            ``in_order`` and ``out_order``.
        memory_layout: Indexed by variable to obtain allocations.

    Returns:
        An empty list for the in-place case, otherwise a single-element list
        containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    in_place = memory_layout[src] == memory_layout[dst]
    if in_place:
        return []

    assert src.order == op.parameters["in_order"]
    assert dst.order == op.parameters["out_order"]

    injector = BufferInjector()
    injector.register({
        "reinterpret_axis_x": memory_layout[src],
        "reinterpret_axis_y": memory_layout[dst],
        "reinterpret_axis_N": dst.size,
    })

    namer = KernelNameInjector(op)
    code = injector.inject(template)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def axiswise_bias_same_order(op: AxiswiseBias, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the AxiswiseBias kernel from ``generate_template_same_order``.

    The input shape is split around ``op.axis`` into three sizes: the
    product of the leading dimensions (D1), the target dimension (D2) and
    the product of the trailing dimensions (D3). D1 and D3 are also passed
    to the template generator.

    Args:
        op: Operator providing inputs ``x`` and ``b``, output ``y`` and the
            attribute ``axis``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x``, ``b`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    b_alloc = memory_layout[op.inputs["b"]]
    y_alloc = memory_layout[op.outputs["y"]]

    pos = x_alloc.variable.order.axes_dict[op.axis]
    leading = mul(x_alloc.variable.shape[:pos])
    middle = x_alloc.variable.shape[pos]
    trailing = mul(x_alloc.variable.shape[pos + 1:])

    injector = BufferInjector()
    injector.register({
        "axiswise_bias_X": x_alloc,
        "axiswise_bias_B": b_alloc,
        "axiswise_bias_Y": y_alloc,
        "axiswise_bias_D1": leading,
        "axiswise_bias_D2": middle,
        "axiswise_bias_D3": trailing,
    })

    namer = KernelNameInjector(op)
    code = generate_template_same_order(leading, trailing)
    code = injector.inject(code)
    code = namer.inject(code)

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        GPUSize(8, 1, 1),
        GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
        injector.buffer,
        injector.unresolved_value_list,
    )]
def axiswise_scale_general(op: AxiswiseScale, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the AxiswiseScale kernel from ``template_general``.

    Besides the D1/D2/D3 split of the input shape around ``op.axis``, this
    variant registers the input shape and, for every input axis, the stride
    of that axis inside the output, so the input and output orders may
    differ.

    Args:
        op: Operator providing inputs ``x`` and ``s``, output ``y`` and the
            attribute ``axis``.
        memory_layout: Indexed by variable to obtain the allocations of
            ``x``, ``s`` and ``y``.

    Returns:
        A single-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    s_alloc = memory_layout[op.inputs["s"]]
    y_alloc = memory_layout[op.outputs["y"]]

    in_shape = x_alloc.variable.shape
    pos = x_alloc.variable.order.axes_dict[op.axis]
    leading = mul(in_shape[:pos])
    middle = in_shape[pos]
    trailing = mul(in_shape[pos + 1:])

    # Each entry is the product of all later dimensions of y's shape.
    # Collected from the innermost dimension outwards, then reversed.
    out_strides = []
    running = 1
    for dim in reversed(y_alloc.variable.shape):
        out_strides.append(running)
        running *= dim
    out_strides.reverse()

    # For every axis of x (in x's order), the stride of that axis within y.
    out_axis_index = y_alloc.variable.order.axes_dict
    x_stride_in_y = [out_strides[out_axis_index[ax]] for ax in x_alloc.variable.order.axes]

    injector = BufferInjector()
    injector.register({
        "axiswise_scale_X": x_alloc,
        "axiswise_scale_S": s_alloc,
        "axiswise_scale_Y": y_alloc,
        "axiswise_scale_D1": leading,
        "axiswise_scale_D2": middle,
        "axiswise_scale_D3": trailing,
        "axiswise_scale_D": x_alloc.variable.ndim,
        "axiswise_scale_d_target": pos,
        "axiswise_scale_x_shape": in_shape,
        "axiswise_scale_x_stride_in_y": x_stride_in_y,
    })

    namer = KernelNameInjector(op)
    code = namer.inject(injector.inject(template_general))

    kernel_name = namer.name
    return [Kernel(
        {kernel_name: code},
        kernel_name,
        injector.buffer,
        injector.unresolved_value_list,
    )]
def tensordot(op: Tensordot, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a Tensordot operator.

    The contraction is expressed through the ``sgemm_*`` placeholders with
    sizes derived from the tensor shapes:

    * ``K`` - product of the lengths of ``A`` along the reduced axes
      ``axes[0]``,
    * ``M = A.size // K``,
    * ``N = B.size // K``.

    These values are registered once and used for both template variants.
    Previously the non-Eigen branch re-registered ``sgemm_M/N/K`` from
    ``op.M``, ``op.N`` and ``op.K``, overriding the sizes computed here with
    attributes that this function never otherwise reads, so the two
    branches relied on different dimension sources.

    Args:
        op: Operator providing inputs ``A`` and ``B``, output ``C`` and the
            attribute ``axes`` (``axes[0]`` are the reduced axes of ``A``,
            ``axes[1]`` those of ``B``).
        memory_layout: Indexed by variable to obtain the values bound to the
            ``sgemm_A`` / ``sgemm_B`` / ``sgemm_C`` placeholders.

    Returns:
        A single-element list containing the generated kernel.
    """
    A = op.inputs["A"]
    B = op.inputs["B"]
    C = op.outputs["C"]
    axes = op.axes

    # Reduced axes must be located on inside of input variables.
    assert A.order.axes[-len(axes[0]):] == axes[0]
    assert B.order.axes[-len(axes[1]):] == axes[1]

    # output variable's axes order must be as [*a_remained_axes, *b_remained_axes]
    assert C.order.axes[:A.ndim - len(axes[0])] == A.order.axes[:-len(axes[0])]
    assert C.order.axes[-(B.ndim - len(axes[1])):] == B.order.axes[:-len(axes[1])]
    assert C.ndim == A.ndim - len(axes[0]) + B.ndim - len(axes[1])

    K = mul(A.shape_dict[a] for a in axes[0])
    M = A.size // K
    N = B.size // K

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "sgemm_A": memory_layout[A],
        "sgemm_B": memory_layout[B],
        "sgemm_C": memory_layout[C],
        "sgemm_M": M,
        "sgemm_N": N,
        "sgemm_K": K
    })

    # Only the template differs between the two variants; the placeholder
    # values registered above are shared by both.
    if op.has_attribute(UseEigenAttribute):
        source = generate_template_eigen(True, False)
    else:
        source = generate_template(True, False)

    name_injector = KernelNameInjector(op)
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel(
        {name_injector.name: source},
        name_injector.name,
        buffer_injector.buffer,
        buffer_injector.unresolved_value_list
    )

    return [kernel]