예제 #1
0
def col2im(op: Col2Im, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Col2Im`` operator.

    Args:
        op: Operator supplying the ``"col"`` input, the ``"im"`` output and the
            ``KH``/``KW``/``SH``/``SW``/``PH``/``PW`` attributes.
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If ``col`` is not ordered (N, H, W, KH, KW, C) or
            ``im`` is not ``OrderNHWC``.
    """
    col_var = op.inputs["col"]
    im_var = op.outputs["im"]

    assert col_var.order == Order([Axis.N, Axis.H, Axis.W, Axis.KH, Axis.KW, Axis.C])
    assert im_var.order == OrderNHWC

    col_dims = col_var.shape_dict
    im_dims = im_var.shape_dict

    # Placeholder name -> value injected into the kernel template.
    placeholders = {
        "col2im_im": memory_layout[im_var],
        "col2im_col": memory_layout[col_var],
        "col2im_N": col_dims[Axis.N],
        "col2im_H2": col_dims[Axis.H],
        "col2im_W2": col_dims[Axis.W],
        "col2im_C1": im_dims[Axis.C],
        "col2im_H1": im_dims[Axis.H],
        "col2im_W1": im_dims[Axis.W],
    }
    # Kernel-size, stride and padding attributes are forwarded from the operator as-is.
    for attr in ("KH", "KW", "SH", "SW", "PH", "PW"):
        placeholders[f"col2im_{attr}"] = getattr(op, attr)

    buffers = BufferInjector()
    buffers.register(placeholders)
    namer = KernelNameInjector(op)

    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #2
0
def zero_padding_1d(op: ZeroPadding1D,
                    memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``ZeroPadding1D`` operator.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``"padding"`` parameter.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If the input or the output is not ``OrderNTC``.
    """
    src = memory_layout[op.inputs["x"]]
    dst = memory_layout[op.outputs["y"]]

    assert src.variable.order == OrderNTC
    assert dst.variable.order == OrderNTC

    src_dims = src.variable.shape_dict

    buffers = BufferInjector()
    buffers.register({
        "zero_padding_1d_X": src,
        "zero_padding_1d_Y": dst,
        "zero_padding_1d_N": src_dims[Axis.N],
        "zero_padding_1d_T1": src_dims[Axis.T],
        "zero_padding_1d_C": src_dims[Axis.C],
        "zero_padding_1d_T2": dst.variable.shape_dict[Axis.T],
        # Only the first padding amount is registered; the second one
        # (op.parameters["padding"][1]) is unused in the kernel.
        "zero_padding_1d_Pad1L": op.parameters["padding"][0],
    })

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #3
0
파일: max.py 프로젝트: zhangaz1/webdnn
def max_handler(op: Max, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Max`` operator.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``"axis"`` parameter.
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]
    target_axis = op.parameters["axis"]

    placeholders = {
        "max_X": memory_layout[src],
        "max_Y": memory_layout[dst],
        "max_y_stride": dst.stride,
        "max_y_shape": dst.shape,
        # Input strides, listed following the axis order of the output.
        "max_x_stride": [src.stride_dict[a] for a in dst.order.axes],
        "max_D": dst.ndim,
        "max_N": src.shape_dict[target_axis],
        "max_MAX_GID": dst.size,
        "max_x_target_axis_stride": src.stride_dict[target_axis],
    }

    buffers = BufferInjector()
    buffers.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #4
0
def im2col(op: Im2Col, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an ``Im2Col`` operator.

    The template is chosen from the order of the ``col`` output:
    ``template_CNHW`` for ``OrderCNHW``, otherwise ``template_NHWC``.

    Args:
        op: Operator supplying the ``"im"`` input, the ``"col"`` output and the
            ``KH``/``KW``/``DH``/``DW``/``SH``/``SW``/``PH``/``PW`` attributes.
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If ``im`` is not ``OrderNHWC`` or ``col`` is neither
            ``OrderNHWC`` nor ``OrderCNHW``.
    """
    im_var = op.inputs["im"]
    col_var = op.outputs["col"]

    assert im_var.order == OrderNHWC
    assert col_var.order == OrderNHWC or col_var.order == OrderCNHW

    im_dims = im_var.shape_dict
    col_dims = col_var.shape_dict

    placeholders = {
        "im2col_im": memory_layout[im_var],
        "im2col_col": memory_layout[col_var],
        "im2col_N": col_dims[Axis.N],
        "im2col_C1": im_dims[Axis.C],
        "im2col_H1": im_dims[Axis.H],
        "im2col_W1": im_dims[Axis.W],
        "im2col_H2": col_dims[Axis.H],
        "im2col_W2": col_dims[Axis.W],
    }
    # Kernel-size, dilation, stride and padding attributes come straight from the operator.
    for attr in ("KH", "KW", "DH", "DW", "SH", "SW", "PH", "PW"):
        placeholders[f"im2col_{attr}"] = getattr(op, attr)

    buffers = BufferInjector()
    buffers.register(placeholders)

    namer = KernelNameInjector(op)

    if col_var.order == OrderCNHW:
        code = template_CNHW
    else:
        code = template_NHWC
    code = namer.inject(buffers.inject(code))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #5
0
def max_pooling_2d(op: MaxPooling2D,
                   memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``MaxPooling2D`` operator.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``"ksize"``, ``"stride"`` and ``"padding"`` parameters (each
            indexed at positions 0 and 1).
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If the input or the output is not ``OrderNHWC``.
    """
    src = op.inputs["x"]
    dst = op.outputs["y"]

    assert src.order == OrderNHWC
    assert dst.order == OrderNHWC

    src_dims = src.shape_dict
    dst_dims = dst.shape_dict
    params = op.parameters

    buffers = BufferInjector()
    buffers.register({
        "max_pooling_2d_X": memory_layout[src],
        "max_pooling_2d_Y": memory_layout[dst],
        "max_pooling_2d_N": src_dims[Axis.N],
        "max_pooling_2d_H1": src_dims[Axis.H],
        "max_pooling_2d_W1": src_dims[Axis.W],
        "max_pooling_2d_C": src_dims[Axis.C],
        "max_pooling_2d_H2": dst_dims[Axis.H],
        "max_pooling_2d_W2": dst_dims[Axis.W],
        "max_pooling_2d_KH": params["ksize"][0],
        "max_pooling_2d_KW": params["ksize"][1],
        "max_pooling_2d_SH": params["stride"][0],
        "max_pooling_2d_SW": params["stride"][1],
        "max_pooling_2d_PH": params["padding"][0],
        "max_pooling_2d_PW": params["padding"][1],
    })

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #6
0
def axiswise_scale_same_order(op: AxiswiseScale,
                              memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an ``AxiswiseScale`` operator.

    The shape of ``x`` is split around ``op.axis`` into three sizes: the
    product of the extents before the axis (D1), the extent of the axis itself
    (D2) and the product of the extents after it (D3).

    Args:
        op: Operator supplying the ``"x"`` and ``"s"`` inputs, the ``"y"``
            output and the ``axis`` attribute.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    s_alloc = memory_layout[op.inputs["s"]]
    y_alloc = memory_layout[op.outputs["y"]]

    x_var = x_alloc.variable
    axis_pos = x_var.order.axes_dict[op.axis]
    outer_size = mul(x_var.shape[:axis_pos])
    axis_size = x_var.shape[axis_pos]
    inner_size = mul(x_var.shape[axis_pos + 1:])

    buffers = BufferInjector()
    buffers.register({
        "axiswise_scale_X": x_alloc,
        "axiswise_scale_S": s_alloc,
        "axiswise_scale_Y": y_alloc,
        "axiswise_scale_D1": outer_size,
        "axiswise_scale_D2": axis_size,
        "axiswise_scale_D3": inner_size,
    })

    namer = KernelNameInjector(op)

    # The template itself is generated from the outer and inner sizes.
    code = generate_template_same_order(outer_size, inner_size)
    code = namer.inject(buffers.inject(code))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #7
0
def elementwise_add(op: ScalarAffine) -> List[Kernel]:
    """Generate the kernel for a ``ScalarAffine`` operator.

    The template is ``template_R`` when the channel mode of the output is
    ``ChannelModeEnum.R`` and ``template_RGBA`` otherwise.

    Args:
        op: Operator supplying the ``"x0"`` input, the ``"y"`` output and the
            ``"scale"`` and ``"bias"`` parameters.

    Returns:
        A one-element list containing the generated kernel, which targets ``y``.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    loop_shapes, loop_strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()

    uniforms.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": loop_shapes[dst],
        "s_Y": loop_strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": loop_shapes[src],
        "s_X0": loop_strides[src],
        "scale": op.parameters["scale"],
        "bias": op.parameters["bias"],
    })

    if ChannelMode.get(dst) == ChannelModeEnum.R:
        code = template_R
    else:
        code = template_RGBA
    code = namer.inject(uniforms.inject(code))

    return [Kernel(code, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
예제 #8
0
파일: linear.py 프로젝트: xczhanjun/webdnn
def linear(op: Linear, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Linear`` operator.

    Three sizes are registered: M is the N-extent of ``y``, N is the size of
    ``y`` divided by M, and K is the size of ``x`` divided by its N-extent.

    Args:
        op: Operator supplying the ``"x"`` and ``"w"`` inputs and the ``"y"``
            output.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If ``x``/``y`` are not ``OrderNC`` or ``OrderNHWC``,
            ``w`` is not ``OrderCN`` or ``OrderHWCN``, or ``w`` and ``x``
            differ in number of dimensions.
    """
    x_alloc = memory_layout[op.inputs["x"]]
    w_alloc = memory_layout[op.inputs["w"]]
    y_alloc = memory_layout[op.outputs["y"]]

    x_var = x_alloc.variable
    w_var = w_alloc.variable
    y_var = y_alloc.variable

    assert x_var.order == OrderNC or x_var.order == OrderNHWC
    assert w_var.order == OrderCN or w_var.order == OrderHWCN
    assert y_var.order == OrderNC or y_var.order == OrderNHWC
    assert w_var.ndim == x_var.ndim

    buffers = BufferInjector()
    buffers.register({
        "linear_X": x_alloc,
        "linear_Y": y_alloc,
        "linear_W": w_alloc,
        "linear_M": y_var.shape_dict[Axis.N],
        "linear_N": y_var.size // y_var.shape_dict[Axis.N],
        "linear_K": x_var.size // x_var.shape_dict[Axis.N],
    })

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #9
0
파일: col2im.py 프로젝트: xczhanjun/webdnn
def col2im(op: Col2Im, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Col2Im`` operator.

    Args:
        op: Operator supplying the ``"col"`` input, the ``"im"`` output and the
            ``KH``/``KW``/``SH``/``SW``/``PH``/``PW`` attributes.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If ``col`` or ``im`` is not ``OrderNHWC``.
    """
    col_alloc = memory_layout[op.inputs["col"]]
    im_alloc = memory_layout[op.outputs["im"]]

    assert col_alloc.variable.order == OrderNHWC
    assert im_alloc.variable.order == OrderNHWC

    col_dims = col_alloc.variable.shape_dict
    im_dims = im_alloc.variable.shape_dict

    placeholders = {
        "col2im_im": im_alloc,
        "col2im_col": col_alloc,
        "col2im_N": col_dims[Axis.N],
        "col2im_H2": col_dims[Axis.H],
        "col2im_W2": col_dims[Axis.W],
        "col2im_C1": im_dims[Axis.C],
        "col2im_H1": im_dims[Axis.H],
        "col2im_W1": im_dims[Axis.W],
    }
    # Kernel-size, stride and padding attributes are forwarded from the operator as-is.
    for attr in ("KH", "KW", "SH", "SW", "PH", "PW"):
        placeholders[f"col2im_{attr}"] = getattr(op, attr)

    buffers = BufferInjector()
    buffers.register(placeholders)

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(8, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #10
0
def elementwise_add(op: LeakyRelu) -> List[Kernel]:
    """Generate the kernel for a ``LeakyRelu`` operator.

    Args:
        op: Operator supplying the ``"x0"`` input, the ``"y"`` output and the
            ``"slope"`` parameter.

    Returns:
        A one-element list containing the generated kernel, which targets ``y``.
    """
    src = op.inputs["x0"]
    dst = op.outputs["y"]

    loop_shapes, loop_strides = optimize_loop_structure([src, dst], dst)

    namer = KernelNameInjector(op)
    uniforms = UniformInjector()

    uniforms.register({
        "X0": src,
        "s_y": texture_stride(dst),
        "d_Y": loop_shapes[dst],
        "s_Y": loop_strides[dst],
        "d_x0": texture_shape(src),
        "s_x0": texture_stride(src),
        "d_X0": loop_shapes[src],
        "s_X0": loop_strides[src],
        "slope": op.parameters["slope"],
    })

    code = namer.inject(uniforms.inject(template))

    return [Kernel(code, namer.name, uniforms.samplers, uniforms.uniforms, dst)]
예제 #11
0
def space2depth(op: Space2Depth, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Space2Depth`` operator.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``'r'`` parameter.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If the input or the output is not ``OrderNHWC``.
    """
    src = memory_layout[op.inputs["x"]]
    dst = memory_layout[op.outputs["y"]]
    block_size = op.parameters['r']

    assert src.variable.order == OrderNHWC
    assert dst.variable.order == OrderNHWC

    src_dims = src.variable.shape_dict
    dst_dims = dst.variable.shape_dict

    buffers = BufferInjector()
    buffers.register({
        "space2depth_x": src,
        "space2depth_y": dst,
        'space2depth_r': block_size,
        "space2depth_N": src_dims[Axis.N],
        "space2depth_C1": src_dims[Axis.C],
        "space2depth_C2": dst_dims[Axis.C],
        "space2depth_H1": src_dims[Axis.H],
        "space2depth_H2": dst_dims[Axis.H],
        "space2depth_W1": src_dims[Axis.W],
        "space2depth_W2": dst_dims[Axis.W],
    })

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #12
0
def Normalize(op: Normalize, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Normalize`` operator.

    Only the case where the normalized axis is the last axis of ``x`` is
    accepted.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``"axis"`` and ``"eps"`` parameters.
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If ``x`` and ``y`` differ in order or shape, or the
            target axis is not the last axis of ``x``.
    """
    # NOTE(review): this function has the same name as the type used in its own
    # annotation, so it rebinds that name in the enclosing scope once defined.
    src = op.inputs["x"]
    dst = op.outputs["y"]

    assert dst.order == src.order
    assert dst.shape == src.shape

    target_axis = op.parameters["axis"]
    last_axis = src.order.axes[-1]
    assert target_axis == last_axis, \
        "[Webassembly] Normalize supports only for aggregating last axis."

    axis_size = dst.shape_dict[target_axis]

    buffers = BufferInjector()
    buffers.register({
        "normalize_X": memory_layout[src],
        "normalize_Y": memory_layout[dst],
        "normalize_N": dst.size // axis_size,
        "normalize_C": axis_size,
        "normalize_param_eps": float(op.parameters["eps"]),
    })

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(template))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #13
0
def tensordot(op: Tensordot, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for a ``Tensordot`` operator.

    The contraction is expressed through three sizes: K is the product of the
    extents of the reduced axes of ``A``, M is ``A.size // K`` and N is
    ``B.size // K``.

    Args:
        op: Operator supplying the ``"A"`` and ``"B"`` inputs, the ``"C"``
            output and the ``axes`` attribute (``axes[0]`` lists the reduced
            axes of ``A``, ``axes[1]`` those of ``B``).
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If the reduced axes are not the trailing axes of the
            inputs, or the axes of ``C`` are not the remaining axes of ``A``
            followed by the remaining axes of ``B``.
    """
    A = op.inputs["A"]
    B = op.inputs["B"]
    C = op.outputs["C"]
    axes = op.axes

    # Number of axes of each input that survive the contraction.
    # Every slice below uses these non-negative bounds instead of negative
    # lengths: a negative length degenerates when it is zero (``s[-0:]`` is the
    # whole sequence and ``s[:-0]`` is empty), which made the checks fail
    # spuriously when an input had no remaining (or no reduced) axes.
    a_remained = A.ndim - len(axes[0])
    b_remained = B.ndim - len(axes[1])

    # Reduced axes must be located on inside of input variables.
    assert A.order.axes[a_remained:] == axes[0]
    assert B.order.axes[b_remained:] == axes[1]

    # output variable's axes order must be as [*a_remained_axes, *b_remained_axes]
    assert C.order.axes[:a_remained] == A.order.axes[:a_remained]
    assert C.order.axes[C.ndim - b_remained:] == B.order.axes[:b_remained]
    assert C.ndim == a_remained + b_remained

    K = mul(A.shape_dict[a] for a in axes[0])
    M = A.size // K
    N = B.size // K

    # Registered once. The original re-registered the same keys with the same
    # values in both template branches.
    # NOTE(review): assumes BufferInjector.register() is a plain key -> value
    # update, so repeating it with identical data had no effect - confirm.
    buffer_injector = BufferInjector()
    buffer_injector.register({
        "A": memory_layout[A],
        "B": memory_layout[B],
        "C": memory_layout[C],
        "M": M,
        "N": N,
        "K": K
    })

    if op.has_attribute(UseEigenAttribute):
        source = generate_template_eigen(True, False)
    else:
        source = generate_template(True, False)

    name_injector = KernelNameInjector(op)

    source = buffer_injector.inject(source)
    source = name_injector.inject(source)

    kernel = Kernel(
        {name_injector.name: source},
        name_injector.name,
        buffer_injector.buffer,
        buffer_injector.unresolved_value_list
    )

    return [kernel]
예제 #14
0
def concat(op: Concat) -> List[Kernel]:
    """Generate the kernels for a ``Concat`` operator.

    Two kernels are emitted per input: the first is built from ``template``
    and targets ``y``, the second is built from ``template2`` and targets the
    ``workspace`` variable.

    Args:
        op: Operator supplying inputs ``"x0"``, ``"x1"``, ... plus a
            ``"workspace"`` input, the ``"y"`` output and the ``axis``
            attribute.

    Returns:
        The generated kernels, two per input, in input order.

    Raises:
        AssertionError: If an input does not have the same axes as ``y`` or
            its channel mode differs from that of ``y``.
    """
    # Every entry of op.inputs except "workspace" is one of the concatenated inputs.
    xs = [op.inputs[f"x{i}"] for i in range(len(op.inputs) - 1)]
    workspace = op.inputs["workspace"]
    y = op.outputs["y"]
    axis = op.axis

    kernels = []

    # sections[i] is where xs[i] starts along `axis` inside y, i.e. the summed
    # extent of xs[0] .. xs[i-1]. The original accumulated over xs[1:], which
    # shifted every offset by one input and was only right when all inputs had
    # the same extent along `axis`.
    sections = [0]
    for x in xs[:-1]:
        sections.append(sections[-1] + x.shape_dict[axis])

    for i, x in enumerate(xs):
        assert x.order.check_same_axes(y.order)
        assert ChannelMode.get(x) == ChannelMode.get(y)

        # Non-zero only along the concatenation axis.
        offset = [sections[i] if a == axis else 0 for a in y.order.axes]

        name_injector = KernelNameInjector(op)
        uniform_injector = UniformInjector()
        uniform_injector.register({
            "sampler_x": x,
            "sampler_workspace": workspace,

            "texture_shape_workspace": texture_shape(workspace),

            "texture_stride_y": texture_stride(y),
            "variable_shape_y": _pad_to_4d(y.shape),
            "variable_stride_y": _pad_to_4d(y.stride),

            "texture_shape_x": texture_shape(x),
            "texture_stride_x": texture_stride(x),
            # x's shape and stride are listed following y's axis order.
            "variable_shape_x": _pad_to_4d([x.shape_dict[a] for a in y.order.axes]),
            "variable_stride_x": _pad_to_4d([x.stride_dict[a] for a in y.order.axes]),

            "offset": _pad_to_4d(offset, 0)
        })
        source = template
        source = uniform_injector.inject(source)
        source = name_injector.inject(source)
        kernel = Kernel(
            source,
            name_injector.name,
            uniform_injector.samplers,
            uniform_injector.uniforms,
            y
        )
        kernels.append(kernel)

        # Second kernel: samples y and targets the workspace.
        name_injector2 = KernelNameInjector(op)
        uniform_injector2 = UniformInjector()
        uniform_injector2.register({
            "sampler_y": y,
            "texture_shape_y": texture_shape(y),
        })
        source2 = template2
        source2 = uniform_injector2.inject(source2)
        source2 = name_injector2.inject(source2)
        kernel2 = Kernel(
            source2,
            name_injector2.name,
            uniform_injector2.samplers,
            uniform_injector2.uniforms,
            workspace
        )
        kernels.append(kernel2)

    return kernels
예제 #15
0
파일: lstm.py 프로젝트: xczhanjun/webdnn
def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an ``LSTM`` operator.

    Args:
        op: Operator supplying the ``"x"``, ``"b"``, ``"x_and_h"``,
            ``"w_all"`` and ``"workspace"`` inputs (plus ``"initial_c"`` /
            ``"initial_h"`` when the matching ``use_initial_*`` parameter is
            set), the ``"y"`` and ``"final_c"`` outputs and the
            ``"activation"``, ``"recurrent_activation"`` and
            ``"return_sequences"`` parameters.
        memory_layout: Indexed by variable to obtain its allocation.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If a variable does not have the expected order.
        NotImplementedError: If ``activation`` is not ``"tanh"`` or
            ``recurrent_activation`` is neither ``"hard_sigmoid"`` nor
            ``"sigmoid"``.
    """
    x = memory_layout[op.inputs["x"]]
    b = memory_layout[op.inputs["b"]]
    y = memory_layout[op.outputs["y"]]
    x_and_h = memory_layout[op.inputs["x_and_h"]]
    w_all = memory_layout[op.inputs["w_all"]]
    workspace = memory_layout[op.inputs["workspace"]]
    final_c = memory_layout[op.outputs["final_c"]]

    params = op.parameters
    use_initial_c = params["use_initial_c"]
    use_initial_h = params["use_initial_h"]
    return_sequences = params["return_sequences"]

    assert x.variable.order == OrderNTC, (
        f"Current implementation supports only OrderNTC for input variable order: x.order = {x.variable.order}"
    )

    # The expected output order depends on whether the whole sequence is returned.
    if return_sequences:
        assert y.variable.order == OrderNTC, (
            f"Current implementation supports only OrderNTC for output variable of "
            f"LSTM in return_sequences=True mode: y.order = {y.variable.order}"
        )
    else:
        assert y.variable.order == OrderNC, (
            f"Current implementation supports only OrderNC for output variable of LSTM "
            f"in return_sequences=False mode: y.order = {y.variable.order}"
        )

    assert w_all.variable.order == OrderCN
    assert final_c.variable.order == OrderNC

    x_dims = x.variable.shape_dict

    placeholders = {
        "lstm_X": x,
        "lstm_Y": y,
        "lstm_b": b,
        "lstm_N": x_dims[Axis.N],
        "lstm_T": x_dims[Axis.T],
        "lstm_C1": x_dims[Axis.C],
        "lstm_C2": y.variable.shape_dict[Axis.C],
        "lstm_X_and_H": x_and_h,
        "lstm_W_all": w_all,
        "lstm_workspace": workspace,
        "lstm_final_C": final_c,
    }
    # Initial states are optional; 0 is registered in place of a missing one.
    if use_initial_c:
        placeholders["lstm_initial_C"] = memory_layout[op.inputs["initial_c"]]
    else:
        placeholders["lstm_initial_C"] = 0
    if use_initial_h:
        placeholders["lstm_initial_H"] = memory_layout[op.inputs["initial_h"]]
    else:
        placeholders["lstm_initial_H"] = 0

    buffers = BufferInjector()
    buffers.register(placeholders)

    namer = KernelNameInjector(op)

    # Activations are passed to the template generator as expressions over "x".
    if params["activation"] != "tanh":
        raise NotImplementedError
    activation_expr = "(tanh(x))"

    recurrent_name = params["recurrent_activation"]
    if recurrent_name == "hard_sigmoid":
        recurrent_expr = "((x) < -2.5 ? 0.0 : ((x) > +2.5 ? 1.0 : ((x) * 0.2 + 0.5)))"
    elif recurrent_name == "sigmoid":
        recurrent_expr = "(tanh(0.5f * (x)) * 0.5f + 0.5f)"
    else:
        raise NotImplementedError

    code = generate_template_general(use_initial_c, use_initial_h, return_sequences,
                                     activation_expr, recurrent_expr)
    code = namer.inject(buffers.inject(code))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            GPUSize(1, 1, 1),
            GPUSize(MAX_THREADS_PER_THREADGROUP, 1, 1),
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]
예제 #16
0
파일: reduce.py 프로젝트: unixnme/webdnn
def reduce_kernel(op: Reduce):
    """Generate the kernel for a reduction operator.

    The orders of ``x`` and ``y`` are simplified (keeping the reduced axis),
    then both are described by 4-element "virtual" shape/stride lists. For
    ``x`` the first entries follow the axis order of ``y`` and the last entry
    describes the reduced axis.

    Args:
        op: Operator supplying the ``"x"`` input, the ``"y"`` output and the
            ``axis`` attribute. Extra uniforms are taken from the entry
            registered for ``op.__class__`` in ``_registered_items``.

    Returns:
        A one-element list containing the generated kernel, which targets ``y``.

    Raises:
        NotImplementedError: If the simplified order of ``y`` has more than
            four dimensions.
    """
    x = op.inputs["x"]
    y = op.outputs["y"]
    axis = op.axis

    orders, shape_dicts = simplify_orders([x, y], keep_axes=[axis])

    # Padding shapes and strides to 4D
    if orders[y].ndim > 4:
        raise NotImplementedError(f"Too large number of dimension: {y}")

    shapes = {v: [shape_dicts[v][a] for a in orders[v].axes] for v in [x, y]}
    # Stride of an axis = product of the extents of all axes after it.
    strides = {
        v:
        [mul(shapes[v][orders[v].axes_dict[a] + 1:]) for a in orders[v].axes]
        for v in [x, y]
    }
    stride_dicts = {v: AxisKeyDict(orders[v].axes, strides[v]) for v in [x, y]}

    # Change x's shapes and strides order to same as y's order
    x_virtual_shape = [
        shape_dicts[x][a] if a in orders[x].axes else 1 for a in orders[y].axes
    ]
    x_virtual_stride = [
        stride_dicts[x][a] if a in orders[x].axes else 1
        for a in orders[y].axes
    ]
    # Pad with dummy dimensions of extent 1 and stride 1, the same way y is
    # padded below. The original padded the *shape* with the stride of the
    # reduced axis, which is not an extent.
    while len(x_virtual_shape) < 3:
        x_virtual_stride.append(1)
        x_virtual_shape.append(1)
    # The reduced axis always comes last.
    x_virtual_shape.append(shape_dicts[x][axis])
    x_virtual_stride.append(stride_dicts[x][axis])

    y_virtual_shape = shapes[y]
    y_virtual_stride = strides[y]
    while len(y_virtual_shape) < 4:
        y_virtual_stride.append(1)
        y_virtual_shape.append(1)

    name_injector = KernelNameInjector(op)
    uniform_injector = UniformInjector()

    uniform_injector.register({
        "texture_stride_y": texture_stride(y),
        "variable_shape_y": y_virtual_shape,
        "variable_stride_y": y_virtual_stride,
        "sampler_x": x,
        "texture_shape_x": texture_shape(x),
        "texture_stride_x": texture_stride(x),
        "variable_shape_x": x_virtual_shape,
        "variable_stride_x": x_virtual_stride,
    })

    # Operator-specific uniforms: each registered getter is called with the operator.
    for name, getter in _registered_items[op.__class__].parameters.items():
        uniform_injector.register({name: getter(op)})

    # Computing logical position is required.
    source = _generate_template_convert_position(
        op, reduction_size=shape_dicts[x][axis])

    source = uniform_injector.inject(source)
    source = name_injector.inject(source)
    kernel = Kernel(source, name_injector.name, uniform_injector.samplers,
                    uniform_injector.uniforms, y)

    return [kernel]
예제 #17
0
def lstm(op: LSTM, memory_layout: MemoryLayout) -> List[Kernel]:
    """Generate the kernel for an ``LSTM`` operator.

    The source starts from ``template``; each ``%%...%%`` placeholder handled
    here is replaced by a code fragment chosen from the operator's parameters
    before the buffer and kernel-name injectors are applied.

    Args:
        op: Operator supplying the ``"x"``, ``"w_input"`` and ``"w_hidden"``
            inputs (plus ``"b"``, ``"initial_c"``, ``"initial_h"`` when the
            matching ``use_*`` parameter is set), the ``"y"`` and
            ``"final_c"`` outputs and the ``"return_sequences"``,
            ``"activation"`` and ``"recurrent_activation"`` parameters.
        memory_layout: Indexed by variable to obtain the value registered for
            each buffer placeholder.

    Returns:
        A one-element list containing the generated kernel.

    Raises:
        AssertionError: If a variable does not have the expected order.
        NotImplementedError: If ``activation`` is not ``"tanh"`` or
            ``recurrent_activation`` is neither ``"hard_sigmoid"`` nor
            ``"sigmoid"``.
    """
    params = op.parameters

    x = op.inputs["x"]
    w_input = op.inputs["w_input"]
    w_hidden = op.inputs["w_hidden"]
    y = op.outputs["y"]
    final_c = op.outputs["final_c"]

    assert x.order == OrderNTC
    assert w_input.order == OrderCN
    assert w_hidden.order == OrderCN
    if params["return_sequences"]:
        assert y.order == OrderNTC
    else:
        assert y.order == OrderNC
    assert final_c.order == OrderNC

    # W is for updating i, f, c, o
    hidden_dim = w_hidden.shape_dict[Axis.C]

    # Values always registered; optional buffers are added further down.
    items = {
        "lstm_X": memory_layout[x],
        "lstm_Y": memory_layout[y],
        "lstm_final_c": memory_layout[final_c],
        "lstm_W_input": memory_layout[w_input],
        "lstm_W_hidden": memory_layout[w_hidden],
        "lstm_input_dim": x.shape_dict[Axis.C],
        "lstm_sequence_len": x.shape_dict[Axis.T],
        "lstm_batch_size": x.shape_dict[Axis.N],
        "lstm_hidden_dim": hidden_dim
    }

    code = template

    # Sequence output is switched on through a preprocessor define.
    sequence_define = "#define SEQUENCE_OUTPUT" if params["return_sequences"] else ""
    code = code.replace("%%DEFINE_SEQUENCE_OUTPUT%%", sequence_define)

    # Bias: load the buffer and add it row-wise, or drop both fragments.
    if params["use_bias"]:
        items["lstm_b"] = memory_layout[op.inputs["b"]]
        bias_initializer = "float *b = %%LOAD_BUFFER(lstm_b)%%;\nEigen::Map<Eigen::RowVectorXf > vec_b(b, hidden_dim4);"
        bias_applier = "mat_v.rowwise() += vec_b;"
    else:
        bias_initializer = ""
        bias_applier = ""
    code = code.replace("%%BIAS_INITIALIZER%%", bias_initializer)
    code = code.replace("%%BIAS_APPLIER%%", bias_applier)

    # Cell state: copy the given initial value, or fill with zeros.
    if params["use_initial_c"]:
        items["lstm_initial_c"] = memory_layout[op.inputs["initial_c"]]
        initial_c_copier = """
        const float *initial_c = %%LOAD_BUFFER(lstm_initial_c)%%;
        for (int i = 0; i < hidden_dim * batch_size; i++) {
            mem_c[i] = initial_c[i];
        }
        """
    else:
        initial_c_copier = """
        for (int i = 0; i < hidden_dim * batch_size; i++) {
            mem_c[i] = 0.0F;
        }
        """
    code = code.replace("%%INITIAL_C_COPIER%%", initial_c_copier)

    # Hidden state: copy the given initial value; nothing is emitted otherwise.
    if params["use_initial_h"]:
        items["lstm_initial_h"] = memory_layout[op.inputs["initial_h"]]
        initial_h_copier = """
        const float *initial_h = %%LOAD_BUFFER(lstm_initial_h)%%;
        for (int i = 0; i < hidden_dim * batch_size; i++) {
            mem_h[i] = initial_h[i];
        }
        """
    else:
        initial_h_copier = ""
    code = code.replace("%%INITIAL_H_COPIER%%", initial_h_copier)

    if params["activation"] != "tanh":
        raise NotImplementedError
    code = code.replace("%%ACTIVATION_CORE%%", """
        return tanhf(x);
        """)

    recurrent_name = params["recurrent_activation"]
    if recurrent_name == "hard_sigmoid":
        recurrent_core = """
        x = x * 0.2F + 0.5F;
        if (x < 0.0F) {
            x = 0.0F;
        } else if (x > 1.0F) {
            x = 1.0F;
        }
        return x;
        """
    elif recurrent_name == "sigmoid":
        recurrent_core = """
        x = 1.0F / (1.0 + expf(-x));
        return x;
        """
    else:
        raise NotImplementedError
    code = code.replace("%%RECURRENT_ACTIVATION_CORE%%", recurrent_core)

    buffers = BufferInjector()
    buffers.register(items)

    namer = KernelNameInjector(op)
    code = namer.inject(buffers.inject(code))

    return [
        Kernel(
            {namer.name: code},
            namer.name,
            buffers.buffer,
            buffers.unresolved_value_list,
        )
    ]