def col2im(op: Col2Im, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Col2Im`` operator.

    The ``col`` input must be ordered (N, H, W, KH, KW, C) and the ``im``
    output must be ``OrderNHWC``; both are checked with assertions.

    Args:
        op: Operator providing the ``col`` input, the ``im`` output and the
            ``KH``/``KW``/``SH``/``SW``/``PH``/``PW`` attributes.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    col_var = op.inputs["col"]
    im_var = op.outputs["im"]

    expected_col_order = Order([Axis.N, Axis.H, Axis.W, Axis.KH, Axis.KW, Axis.C])
    assert col_var.order == expected_col_order
    assert im_var.order == OrderNHWC

    buffer_injector = BufferInjector()
    placeholders = {
        "col2im_im": memory_layout[im_var],
        "col2im_col": memory_layout[col_var],
        "col2im_N": col_var.shape_dict[Axis.N],
        "col2im_H2": col_var.shape_dict[Axis.H],
        "col2im_W2": col_var.shape_dict[Axis.W],
        "col2im_C1": im_var.shape_dict[Axis.C],
        "col2im_H1": im_var.shape_dict[Axis.H],
        "col2im_W1": im_var.shape_dict[Axis.W],
        "col2im_KH": op.KH,
        "col2im_KW": op.KW,
        "col2im_SH": op.SH,
        "col2im_SW": op.SW,
        "col2im_PH": op.PH,
        "col2im_PW": op.PW,
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    # Buffer placeholders are resolved first, then the kernel name.
    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def zero_padding_1d(op: ZeroPadding1D, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``ZeroPadding1D`` operator.

    Both the ``x`` input and the ``y`` output must be ``OrderNTC``; this is
    checked with assertions.

    Args:
        op: Operator providing ``x``, ``y`` and the ``padding`` parameter.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src = memory_layout[op.inputs["x"]]
    dst = memory_layout[op.outputs["y"]]

    assert src.variable.order == OrderNTC
    assert dst.variable.order == OrderNTC

    buffer_injector = BufferInjector()
    # Only the low-side padding (padding[0]) is registered; the high-side
    # value (padding[1], "zero_padding_1d_Pad1H") is unused in kernel.
    placeholders = {
        "zero_padding_1d_X": src,
        "zero_padding_1d_Y": dst,
        "zero_padding_1d_N": src.variable.shape_dict[Axis.N],
        "zero_padding_1d_T1": src.variable.shape_dict[Axis.T],
        "zero_padding_1d_C": src.variable.shape_dict[Axis.C],
        "zero_padding_1d_T2": dst.variable.shape_dict[Axis.T],
        "zero_padding_1d_Pad1L": op.parameters["padding"][0],
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def max_handler(op: Max, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Max`` operator over ``axis``.

    Args:
        op: Operator providing ``x``, ``y`` and the ``axis`` parameter.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src_var = op.inputs["x"]
    dst_var = op.outputs["y"]
    target_axis = op.parameters["axis"]

    buffer_injector = BufferInjector()
    placeholders = {
        "max_X": memory_layout[src_var],
        "max_Y": memory_layout[dst_var],
        "max_y_stride": dst_var.stride,
        "max_y_shape": dst_var.shape,
        # x's strides, listed in the axis order of y.
        "max_x_stride": [src_var.stride_dict[a] for a in dst_var.order.axes],
        "max_D": dst_var.ndim,
        "max_N": src_var.shape_dict[target_axis],
        "max_MAX_GID": dst_var.size,
        "max_x_target_axis_stride": src_var.stride_dict[target_axis],
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def im2col(op: Im2Col, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for an ``Im2Col`` operator.

    The ``im`` input must be ``OrderNHWC``. The ``col`` output must be either
    ``OrderNHWC`` or ``OrderCNHW``; the template is chosen accordingly.

    Args:
        op: Operator providing ``im``, ``col`` and the kernel/dilation/
            stride/padding attributes (``KH``, ``KW``, ``DH``, ``DW``,
            ``SH``, ``SW``, ``PH``, ``PW``).
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    im_var = op.inputs["im"]
    col_var = op.outputs["col"]

    assert im_var.order == OrderNHWC
    assert col_var.order == OrderNHWC or col_var.order == OrderCNHW

    buffer_injector = BufferInjector()
    placeholders = {
        "im2col_im": memory_layout[im_var],
        "im2col_col": memory_layout[col_var],
        "im2col_N": col_var.shape_dict[Axis.N],
        "im2col_C1": im_var.shape_dict[Axis.C],
        "im2col_H1": im_var.shape_dict[Axis.H],
        "im2col_W1": im_var.shape_dict[Axis.W],
        "im2col_H2": col_var.shape_dict[Axis.H],
        "im2col_W2": col_var.shape_dict[Axis.W],
        "im2col_KH": op.KH,
        "im2col_KW": op.KW,
        "im2col_DH": op.DH,
        "im2col_DW": op.DW,
        "im2col_SH": op.SH,
        "im2col_SW": op.SW,
        "im2col_PH": op.PH,
        "im2col_PW": op.PW,
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    # Pick the template that matches the output layout.
    if col_var.order == OrderCNHW:
        source = template_CNHW
    else:
        source = template_NHWC
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def max_pooling_2d(op: MaxPooling2D, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``MaxPooling2D`` operator.

    Both ``x`` and ``y`` must be ``OrderNHWC``; this is checked with
    assertions.

    Args:
        op: Operator providing ``x``, ``y`` and the ``ksize``, ``stride`` and
            ``padding`` parameters (each indexed as ``[0]`` and ``[1]``).
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src_var = op.inputs["x"]
    dst_var = op.outputs["y"]

    assert src_var.order == OrderNHWC
    assert dst_var.order == OrderNHWC

    buffer_injector = BufferInjector()
    placeholders = {
        "max_pooling_2d_X": memory_layout[src_var],
        "max_pooling_2d_Y": memory_layout[dst_var],
        "max_pooling_2d_N": src_var.shape_dict[Axis.N],
        "max_pooling_2d_H1": src_var.shape_dict[Axis.H],
        "max_pooling_2d_W1": src_var.shape_dict[Axis.W],
        "max_pooling_2d_C": src_var.shape_dict[Axis.C],
        "max_pooling_2d_H2": dst_var.shape_dict[Axis.H],
        "max_pooling_2d_W2": dst_var.shape_dict[Axis.W],
        "max_pooling_2d_KH": op.parameters["ksize"][0],
        "max_pooling_2d_KW": op.parameters["ksize"][1],
        "max_pooling_2d_SH": op.parameters["stride"][0],
        "max_pooling_2d_SW": op.parameters["stride"][1],
        "max_pooling_2d_PH": op.parameters["padding"][0],
        "max_pooling_2d_PW": op.parameters["padding"][1],
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def axiswise_scale_same_order(op: AxiswiseScale, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for an ``AxiswiseScale`` operator.

    The shape of ``x`` is split around ``op.axis`` into three extents:
    the product of the dimensions before the axis (D1), the axis itself (D2)
    and the product of the dimensions after it (D3).

    Args:
        op: Operator providing ``x``, ``s``, ``y`` and the ``axis`` attribute.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src = memory_layout[op.inputs["x"]]
    scale = memory_layout[op.inputs["s"]]
    dst = memory_layout[op.outputs["y"]]

    axis_pos = src.variable.order.axes_dict[op.axis]
    outer_size = mul(src.variable.shape[:axis_pos])
    axis_size = src.variable.shape[axis_pos]
    inner_size = mul(src.variable.shape[axis_pos + 1:])

    buffer_injector = BufferInjector()
    placeholders = {
        "axiswise_scale_X": src,
        "axiswise_scale_S": scale,
        "axiswise_scale_Y": dst,
        "axiswise_scale_D1": outer_size,
        "axiswise_scale_D2": axis_size,
        "axiswise_scale_D3": inner_size,
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    # The template itself depends on the outer and inner extents.
    source = generate_template_same_order(outer_size, inner_size)
    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def elementwise_add(op: ScalarAffine) -> List[Kernel]:
    """Build the kernel list for a ``ScalarAffine`` operator.

    The template is ``template_R`` when the channel mode of ``y`` is
    ``ChannelModeEnum.R`` and ``template_RGBA`` otherwise.

    Args:
        op: Operator providing ``x0``, ``y`` and the ``scale`` and ``bias``
            parameters.

    Returns:
        A single-element list holding the generated kernel, which targets
        ``y``.
    """
    src_var = op.inputs["x0"]
    dst_var = op.outputs["y"]

    shapes, strides = optimize_loop_structure([src_var, dst_var], dst_var)

    name_injector = KernelNameInjector(op)

    uniform_injector = UniformInjector()
    uniforms = {
        "X0": src_var,
        "s_y": texture_stride(dst_var),
        "d_Y": shapes[dst_var],
        "s_Y": strides[dst_var],
        "d_x0": texture_shape(src_var),
        "s_x0": texture_stride(src_var),
        "d_X0": shapes[src_var],
        "s_X0": strides[src_var],
        "scale": op.parameters["scale"],
        "bias": op.parameters["bias"],
    }
    uniform_injector.register(uniforms)

    if ChannelMode.get(dst_var) == ChannelModeEnum.R:
        source = template_R
    else:
        source = template_RGBA
    source = uniform_injector.inject(source)
    source = name_injector.inject(source)

    return [
        Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            dst_var,
        )
    ]
def linear(op: Linear, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Linear`` operator.

    Accepted orders (checked with assertions): ``x`` and ``y`` are
    ``OrderNC`` or ``OrderNHWC``, ``w`` is ``OrderCN`` or ``OrderHWCN``, and
    ``w`` has the same number of dimensions as ``x``.

    Args:
        op: Operator providing ``x``, ``w`` and ``y``.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src = memory_layout[op.inputs["x"]]
    weight = memory_layout[op.inputs["w"]]
    dst = memory_layout[op.outputs["y"]]

    assert src.variable.order == OrderNC or src.variable.order == OrderNHWC
    assert weight.variable.order == OrderCN or weight.variable.order == OrderHWCN
    assert dst.variable.order == OrderNC or dst.variable.order == OrderNHWC
    assert weight.variable.ndim == src.variable.ndim

    buffer_injector = BufferInjector()
    placeholders = {
        "linear_X": src,
        "linear_Y": dst,
        "linear_W": weight,
        "linear_M": dst.variable.shape_dict[Axis.N],
        # Element counts per N-slice of the output and of the input.
        "linear_N": dst.variable.size // dst.variable.shape_dict[Axis.N],
        "linear_K": src.variable.size // src.variable.shape_dict[Axis.N],
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def col2im(op: Col2Im, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Col2Im`` operator.

    Both the ``col`` input and the ``im`` output must be ``OrderNHWC``; this
    is checked with assertions.

    Args:
        op: Operator providing ``col``, ``im`` and the
            ``KH``/``KW``/``SH``/``SW``/``PH``/``PW`` attributes.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    col_alloc = memory_layout[op.inputs["col"]]
    im_alloc = memory_layout[op.outputs["im"]]

    assert col_alloc.variable.order == OrderNHWC
    assert im_alloc.variable.order == OrderNHWC

    buffer_injector = BufferInjector()
    placeholders = {
        "col2im_im": im_alloc,
        "col2im_col": col_alloc,
        "col2im_N": col_alloc.variable.shape_dict[Axis.N],
        "col2im_H2": col_alloc.variable.shape_dict[Axis.H],
        "col2im_W2": col_alloc.variable.shape_dict[Axis.W],
        "col2im_C1": im_alloc.variable.shape_dict[Axis.C],
        "col2im_H1": im_alloc.variable.shape_dict[Axis.H],
        "col2im_W1": im_alloc.variable.shape_dict[Axis.W],
        "col2im_KH": op.KH,
        "col2im_KW": op.KW,
        "col2im_SH": op.SH,
        "col2im_SW": op.SW,
        "col2im_PH": op.PH,
        "col2im_PW": op.PW,
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def elementwise_add(op: LeakyRelu) -> List[Kernel]:
    """Build the kernel list for a ``LeakyRelu`` operator.

    Args:
        op: Operator providing ``x0``, ``y`` and the ``slope`` parameter.

    Returns:
        A single-element list holding the generated kernel, which targets
        ``y``.
    """
    src_var = op.inputs["x0"]
    dst_var = op.outputs["y"]

    shapes, strides = optimize_loop_structure([src_var, dst_var], dst_var)

    name_injector = KernelNameInjector(op)

    uniform_injector = UniformInjector()
    uniforms = {
        "X0": src_var,
        "s_y": texture_stride(dst_var),
        "d_Y": shapes[dst_var],
        "s_Y": strides[dst_var],
        "d_x0": texture_shape(src_var),
        "s_x0": texture_stride(src_var),
        "d_X0": shapes[src_var],
        "s_X0": strides[src_var],
        "slope": op.parameters["slope"],
    }
    uniform_injector.register(uniforms)

    # Uniform placeholders are resolved first, then the kernel name.
    source = name_injector.inject(uniform_injector.inject(template))

    return [
        Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            dst_var,
        )
    ]
def space2depth(op: Space2Depth, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Space2Depth`` operator.

    Both ``x`` and ``y`` must be ``OrderNHWC``; this is checked with
    assertions.

    Args:
        op: Operator providing ``x``, ``y`` and the ``r`` parameter.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    src = memory_layout[op.inputs["x"]]
    dst = memory_layout[op.outputs["y"]]
    block_size = op.parameters['r']

    assert src.variable.order == OrderNHWC
    assert dst.variable.order == OrderNHWC

    buffer_injector = BufferInjector()
    placeholders = {
        "space2depth_x": src,
        "space2depth_y": dst,
        'space2depth_r': block_size,
        "space2depth_N": src.variable.shape_dict[Axis.N],
        "space2depth_C1": src.variable.shape_dict[Axis.C],
        "space2depth_C2": dst.variable.shape_dict[Axis.C],
        "space2depth_H1": src.variable.shape_dict[Axis.H],
        "space2depth_H2": dst.variable.shape_dict[Axis.H],
        "space2depth_W1": src.variable.shape_dict[Axis.W],
        "space2depth_W2": dst.variable.shape_dict[Axis.W],
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def Normalize(op: Normalize, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Normalize`` operator.

    ``y`` must have the same order and shape as ``x``, and the ``axis``
    parameter must be the last axis of ``x``'s order; all three conditions
    are checked with assertions.

    Args:
        op: Operator providing ``x``, ``y`` and the ``axis`` and ``eps``
            parameters.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    # NOTE(review): this function has the same name as the ``Normalize`` used
    # in its own annotation, so after this definition the module-level name
    # refers to the function. Kept as-is because renaming would change the
    # public name -- confirm whether a lowercase name was intended.
    src_var = op.inputs["x"]
    dst_var = op.outputs["y"]

    assert dst_var.order == src_var.order
    assert dst_var.shape == src_var.shape

    target_axis = op.parameters["axis"]
    last_axis = src_var.order.axes[-1]
    assert target_axis == last_axis, "[Webassembly] Normalize supports only for aggregating last axis."

    buffer_injector = BufferInjector()
    placeholders = {
        "normalize_X": memory_layout[src_var],
        "normalize_Y": memory_layout[dst_var],
        "normalize_N": dst_var.size // dst_var.shape_dict[target_axis],
        "normalize_C": dst_var.shape_dict[target_axis],
        "normalize_param_eps": float(op.parameters["eps"]),
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(template))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def tensordot(op: Tensordot, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for a ``Tensordot`` operator.

    ``op.axes[0]`` and ``op.axes[1]`` are the reduced axes of ``A`` and ``B``.
    They must be the trailing axes of the respective input, and ``C``'s axes
    must be ``A``'s remaining axes followed by ``B``'s remaining axes. These
    conditions are checked with assertions.

    The slices in those assertions use explicit start indices rather than
    negative lengths. A negative length of zero (``seq[-0:]``) selects the
    whole sequence, which made the checks fail spuriously when an input had
    no reduced axes or when ``B`` had no remaining axes.

    Args:
        op: Operator providing ``A``, ``B``, ``C`` and the ``axes`` attribute.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.
    """
    A = op.inputs["A"]
    B = op.inputs["B"]
    C = op.outputs["C"]
    axes = op.axes

    # Number of axes of each input that survive the reduction.
    a_remained = A.ndim - len(axes[0])
    b_remained = B.ndim - len(axes[1])

    # Reduced axes must be located on inside of input variables.
    assert A.order.axes[len(A.order.axes) - len(axes[0]):] == axes[0]
    assert B.order.axes[len(B.order.axes) - len(axes[1]):] == axes[1]

    # output variable's axes order must be as [*a_remained_axes, *b_remained_axes]
    assert C.order.axes[:a_remained] == A.order.axes[:len(A.order.axes) - len(axes[0])]
    assert C.order.axes[len(C.order.axes) - b_remained:] == B.order.axes[:len(B.order.axes) - len(axes[1])]
    assert C.ndim == a_remained + b_remained

    # K is the product of the reduced extents; M and N are what is left of
    # A and B once K is divided out.
    K = mul(A.shape_dict[a] for a in axes[0])
    M = A.size // K
    N = B.size // K

    buffer_injector = BufferInjector()
    buffer_injector.register({
        "A": memory_layout[A],
        "B": memory_layout[B],
        "C": memory_layout[C],
        "M": M,
        "N": N,
        "K": K
    })

    # NOTE(review): both branches register entries that were already
    # registered above with the same values. The calls are kept so the
    # sequence of injector calls is unchanged -- confirm whether they can go.
    if op.has_attribute(UseEigenAttribute):
        source = generate_template_eigen(True, False)

        buffer_injector.register({
            "A": memory_layout[A],
            "B": memory_layout[B],
            "C": memory_layout[C]
        })

    else:
        source = generate_template(True, False)

        buffer_injector.register({
            "A": memory_layout[A],
            "B": memory_layout[B],
            "C": memory_layout[C],
            "M": M,
            "N": N,
            "K": K
        })

    name_injector = KernelNameInjector(op)

    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel(
        {name_injector.name: source},
        name_injector.name,
        buffer_injector.buffer,
        buffer_injector.unresolved_value_list
    )

    return [kernel]
def concat(op: Concat) -> List[Kernel]:
    """Build the kernel list for a ``Concat`` operator.

    Two kernels are emitted per input ``x{i}``. The first samples ``x{i}``
    and ``workspace`` and targets ``y``; the second samples ``y`` and targets
    ``workspace``.

    Args:
        op: Operator whose inputs are ``x0``, ``x1``, ... plus ``workspace``,
            whose output is ``y``, and whose ``axis`` attribute is the
            concatenation axis.

    Returns:
        The generated kernels, in execution order.
    """
    # Every input except "workspace" is one of the x{i} variables.
    xs = [op.inputs[f"x{i}"] for i in range(len(op.inputs) - 1)]
    workspace = op.inputs["workspace"]
    y = op.outputs["y"]
    axis = op.axis

    kernels = []

    # sections[i] is the start offset of xs[i] along `axis`, i.e. the sum of
    # the extents of xs[0..i-1]. The last input contributes no further offset,
    # so it is the one left out of the accumulation (not the first).
    sections = [0]
    for x in xs[:-1]:
        sections.append(sections[-1] + x.shape_dict[axis])

    for i, x in enumerate(xs):
        assert x.order.check_same_axes(y.order)
        assert ChannelMode.get(x) == ChannelMode.get(y)

        # Offset of this input in y's axis order; non-zero only on `axis`.
        offset = [sections[i] if a == axis else 0 for a in y.order.axes]

        name_injector = KernelNameInjector(op)
        uniform_injector = UniformInjector()
        uniform_injector.register({
            "sampler_x": x,
            "sampler_workspace": workspace,

            "texture_shape_workspace": texture_shape(workspace),

            "texture_stride_y": texture_stride(y),
            "variable_shape_y": _pad_to_4d(y.shape),
            "variable_stride_y": _pad_to_4d(y.stride),

            "texture_shape_x": texture_shape(x),
            "texture_stride_x": texture_stride(x),
            "variable_shape_x": _pad_to_4d([x.shape_dict[a] for a in y.order.axes]),
            "variable_stride_x": _pad_to_4d([x.stride_dict[a] for a in y.order.axes]),

            "offset": _pad_to_4d(offset, 0)
        })

        source = template
        source = uniform_injector.inject(source)
        source = name_injector.inject(source)
        kernel = Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            y
        )
        kernels.append(kernel)

        # NOTE(review): presumably the copy-back from y to workspace must run
        # after every input so that the next iteration's kernel samples the
        # accumulated result -- confirm against the templates.
        name_injector2 = KernelNameInjector(op)
        uniform_injector2 = UniformInjector()
        uniform_injector2.register({
            "sampler_y": y,
            "texture_shape_y": texture_shape(y),
        })

        source2 = template2
        source2 = uniform_injector2.inject(source2)
        source2 = name_injector2.inject(source2)
        kernel2 = Kernel(
            source2,
            name_injector2.name,
            uniform_injector2.samplers,
            uniform_injector2.uniforms,
            workspace
        )
        kernels.append(kernel2)

    return kernels
def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for an ``LSTM`` operator.

    Order requirements (checked with assertions): ``x`` is ``OrderNTC``;
    ``y`` is ``OrderNTC`` when ``return_sequences`` is set and ``OrderNC``
    otherwise; ``w_all`` is ``OrderCN``; ``final_c`` is ``OrderNC``.

    Args:
        op: Operator providing the inputs ``x``, ``b``, ``x_and_h``,
            ``w_all``, ``workspace`` (and ``initial_c`` / ``initial_h`` when
            the matching ``use_initial_*`` parameter is set), the outputs
            ``y`` and ``final_c``, and the ``activation`` /
            ``recurrent_activation`` parameters.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.

    Raises:
        NotImplementedError: If ``activation`` is not ``"tanh"`` or
            ``recurrent_activation`` is neither ``"hard_sigmoid"`` nor
            ``"sigmoid"``.
    """
    x = memory_layout[op.inputs["x"]]
    b = memory_layout[op.inputs["b"]]
    y = memory_layout[op.outputs["y"]]
    x_and_h = memory_layout[op.inputs["x_and_h"]]
    w_all = memory_layout[op.inputs["w_all"]]
    workspace = memory_layout[op.inputs["workspace"]]
    final_c = memory_layout[op.outputs["final_c"]]

    use_initial_c = op.parameters["use_initial_c"]
    use_initial_h = op.parameters["use_initial_h"]
    return_sequences = op.parameters["return_sequences"]

    assert x.variable.order == OrderNTC, (
        f"Current implementation supports only OrderNTC for input variable order: x.order = {x.variable.order}"
    )

    if return_sequences:
        assert y.variable.order == OrderNTC, (
            f"Current implementation supports only OrderNTC for output variable of "
            f"LSTM in return_sequences=True mode: y.order = {y.variable.order}"
        )
    else:
        assert y.variable.order == OrderNC, (
            f"Current implementation supports only OrderNC for output variable of LSTM "
            f"in return_sequences=False mode: y.order = {y.variable.order}"
        )

    assert w_all.variable.order == OrderCN
    assert final_c.variable.order == OrderNC

    batch_size = x.variable.shape_dict[Axis.N]
    sequence_len = x.variable.shape_dict[Axis.T]
    input_dim = x.variable.shape_dict[Axis.C]
    output_dim = y.variable.shape_dict[Axis.C]

    buffer_injector = BufferInjector()
    placeholders = {
        "lstm_X": x,
        "lstm_Y": y,
        "lstm_b": b,
        "lstm_N": batch_size,
        "lstm_T": sequence_len,
        "lstm_C1": input_dim,
        "lstm_C2": output_dim,
        "lstm_X_and_H": x_and_h,
        "lstm_W_all": w_all,
        "lstm_workspace": workspace,
        "lstm_final_C": final_c,
        # 0 is registered when no initial state is supplied.
        "lstm_initial_C": memory_layout[op.inputs["initial_c"]] if use_initial_c else 0,
        "lstm_initial_H": memory_layout[op.inputs["initial_h"]] if use_initial_h else 0,
    }
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    # Only tanh is supported as the main activation.
    if op.parameters["activation"] == "tanh":
        activation_function = "(tanh(x))"
    else:
        raise NotImplementedError

    recurrent_kind = op.parameters["recurrent_activation"]
    if recurrent_kind == "hard_sigmoid":
        recurrent_activation_function = "((x) < -2.5 ? 0.0 : ((x) > +2.5 ? 1.0 : ((x) * 0.2 + 0.5)))"
    elif recurrent_kind == "sigmoid":
        recurrent_activation_function = "(tanh(0.5f * (x)) * 0.5f + 0.5f)"
    else:
        raise NotImplementedError

    source = generate_template_general(
        use_initial_c,
        use_initial_h,
        return_sequences,
        activation_function,
        recurrent_activation_function,
    )
    source = name_injector.inject(buffer_injector.inject(source))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            GPUSize(1, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]
def reduce_kernel(op: Reduce):
    """Build the kernel list for a reduction operator along ``op.axis``.

    The orders of ``x`` and ``y`` are simplified (keeping ``op.axis``), then
    ``y``'s shape and stride are padded to 4 entries. ``x``'s shape and stride
    are rearranged into ``y``'s axis order, padded to 3 entries, and extended
    with the reduced axis as a 4th entry.

    Args:
        op: Operator providing ``x``, ``y`` and the ``axis`` attribute. Its
            class must have an entry in ``_registered_items``.

    Returns:
        A single-element list holding the generated kernel, which targets
        ``y``.

    Raises:
        NotImplementedError: If the simplified order of ``y`` has more than
            4 dimensions.
    """
    x = op.inputs["x"]
    y = op.outputs["y"]
    axis = op.axis

    orders, shape_dicts = simplify_orders([x, y], keep_axes=[axis])

    # Padding shapes and strides to 4D
    if orders[y].ndim > 4:
        raise NotImplementedError(f"Too large number of dimension: {y}")

    shapes = {v: [shape_dicts[v][a] for a in orders[v].axes] for v in [x, y]}
    # Each stride is the product of the extents that follow the axis.
    strides = {
        v: [mul(shapes[v][orders[v].axes_dict[a] + 1:]) for a in orders[v].axes]
        for v in [x, y]
    }
    stride_dicts = {v: AxisKeyDict(orders[v].axes, strides[v]) for v in [x, y]}

    # Change x's shapes and strides order to same as y's order
    x_virtual_shape = [
        shape_dicts[x][a] if a in orders[x].axes else 1 for a in orders[y].axes
    ]
    x_virtual_stride = [
        stride_dicts[x][a] if a in orders[x].axes else 1 for a in orders[y].axes
    ]
    # Padded dimensions have extent 1 and stride 1, as for y below. A stride
    # value must not be used as a padded shape entry.
    while len(x_virtual_shape) < 3:
        x_virtual_stride.append(1)
        x_virtual_shape.append(1)
    # The reduced axis goes last.
    x_virtual_shape.append(shape_dicts[x][axis])
    x_virtual_stride.append(stride_dicts[x][axis])

    y_virtual_shape = shapes[y]
    y_virtual_stride = strides[y]
    while len(y_virtual_shape) < 4:
        y_virtual_stride.append(1)
        y_virtual_shape.append(1)

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()
    uniform_injector.register({
        "texture_stride_y": texture_stride(y),
        "variable_shape_y": y_virtual_shape,
        "variable_stride_y": y_virtual_stride,
        "sampler_x": x,
        "texture_shape_x": texture_shape(x),
        "texture_stride_x": texture_stride(x),
        "variable_shape_x": x_virtual_shape,
        "variable_stride_x": x_virtual_stride,
    })

    # Operator-specific uniforms: each registered entry is called with `op`.
    for name, value_getter in _registered_items[op.__class__].parameters.items():
        uniform_injector.register({name: value_getter(op)})

    # Computing logical position is required.
    source = _generate_template_convert_position(
        op, reduction_size=shape_dicts[x][axis])
    source = uniform_injector.inject(source)
    source = name_injector.inject(source)
    kernel = Kernel(source, name_injector.name, uniform_injector.samplers,
                    uniform_injector.uniforms, y)

    return [kernel]
def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
    """Build the kernel list for an ``LSTM`` operator.

    The ``%%...%%`` markers in the module-level ``template`` are filled in
    from the operator's parameters (sequence output, bias, initial states and
    activations) before buffers and the kernel name are injected.

    Order requirements (checked with assertions): ``x`` is ``OrderNTC``,
    ``w_input`` and ``w_hidden`` are ``OrderCN``, ``y`` is ``OrderNTC`` when
    ``return_sequences`` is set and ``OrderNC`` otherwise, and ``final_c`` is
    ``OrderNC``.

    Args:
        op: Operator providing the inputs ``x``, ``w_input``, ``w_hidden``
            (and ``b``, ``initial_c``, ``initial_h`` when the matching
            ``use_*`` parameter is set) and the outputs ``y`` and ``final_c``.
        memory_layout: Mapping from variables to their allocations.

    Returns:
        A single-element list holding the generated kernel.

    Raises:
        NotImplementedError: If ``activation`` is not ``"tanh"`` or
            ``recurrent_activation`` is neither ``"hard_sigmoid"`` nor
            ``"sigmoid"``.
    """
    x = op.inputs["x"]
    w_input = op.inputs["w_input"]
    w_hidden = op.inputs["w_hidden"]
    y = op.outputs["y"]
    final_c = op.outputs["final_c"]

    assert x.order == OrderNTC
    assert w_input.order == OrderCN
    assert w_hidden.order == OrderCN
    expected_y_order = OrderNTC if op.parameters["return_sequences"] else OrderNC
    assert y.order == expected_y_order
    assert final_c.order == OrderNC

    # W is for updating i, f, c, o
    hidden_dim = w_hidden.shape_dict[Axis.C]

    placeholders = {
        "lstm_X": memory_layout[x],
        "lstm_Y": memory_layout[y],
        "lstm_final_c": memory_layout[final_c],
        "lstm_W_input": memory_layout[w_input],
        "lstm_W_hidden": memory_layout[w_hidden],
        "lstm_input_dim": x.shape_dict[Axis.C],
        "lstm_sequence_len": x.shape_dict[Axis.T],
        "lstm_batch_size": x.shape_dict[Axis.N],
        "lstm_hidden_dim": hidden_dim
    }

    source = template

    # Sequence output switch.
    sequence_define = "#define SEQUENCE_OUTPUT" if op.parameters["return_sequences"] else ""
    source = source.replace("%%DEFINE_SEQUENCE_OUTPUT%%", sequence_define)

    # Optional bias: registers the buffer and enables both bias snippets.
    if op.parameters["use_bias"]:
        placeholders["lstm_b"] = memory_layout[op.inputs["b"]]
        bias_initializer = "float *b = %%LOAD_BUFFER(lstm_b)%%;\nEigen::Map<Eigen::RowVectorXf > vec_b(b, hidden_dim4);"
        bias_applier = "mat_v.rowwise() += vec_b;"
    else:
        bias_initializer = ""
        bias_applier = ""
    source = source.replace("%%BIAS_INITIALIZER%%", bias_initializer)
    source = source.replace("%%BIAS_APPLIER%%", bias_applier)

    # Cell state: copy the supplied initial value or zero-fill.
    if op.parameters["use_initial_c"]:
        placeholders["lstm_initial_c"] = memory_layout[op.inputs["initial_c"]]
        initial_c_copier = """ const float *initial_c = %%LOAD_BUFFER(lstm_initial_c)%%; for (int i = 0; i < hidden_dim * batch_size; i++) { mem_c[i] = initial_c[i]; } """
    else:
        initial_c_copier = """ for (int i = 0; i < hidden_dim * batch_size; i++) { mem_c[i] = 0.0F; } """
    source = source.replace("%%INITIAL_C_COPIER%%", initial_c_copier)

    # Hidden state: copy the supplied initial value, otherwise emit nothing.
    if op.parameters["use_initial_h"]:
        placeholders["lstm_initial_h"] = memory_layout[op.inputs["initial_h"]]
        initial_h_copier = """ const float *initial_h = %%LOAD_BUFFER(lstm_initial_h)%%; for (int i = 0; i < hidden_dim * batch_size; i++) { mem_h[i] = initial_h[i]; } """
    else:
        initial_h_copier = ""
    source = source.replace("%%INITIAL_H_COPIER%%", initial_h_copier)

    if op.parameters["activation"] == "tanh":
        source = source.replace("%%ACTIVATION_CORE%%", """ return tanhf(x); """)
    else:
        raise NotImplementedError

    recurrent_kind = op.parameters["recurrent_activation"]
    if recurrent_kind == "hard_sigmoid":
        recurrent_core = """ x = x * 0.2F + 0.5F; if (x < 0.0F) { x = 0.0F; } else if (x > 1.0F) { x = 1.0F; } return x; """
    elif recurrent_kind == "sigmoid":
        recurrent_core = """ x = 1.0F / (1.0 + expf(-x)); return x; """
    else:
        raise NotImplementedError
    source = source.replace("%%RECURRENT_ACTIVATION_CORE%%", recurrent_core)

    buffer_injector = BufferInjector()
    buffer_injector.register(placeholders)

    name_injector = KernelNameInjector(op)

    source = name_injector.inject(buffer_injector.inject(source))

    kernel_name = name_injector.name
    return [
        Kernel(
            {kernel_name: source},
            kernel_name,
            buffer_injector.buffer,
            buffer_injector.unresolved_value_list,
        )
    ]