Exemplo n.º 1
0
def test_shader_str():
    """
    Run an element-wise multiplication shader compiled from a GLSL string.

    The GLSL source is compiled to SPIR-V, bound to two input tensors and
    one output tensor, dispatched once, and the output is compared with the
    expected products.
    """
    glsl_source = """
#version 450
layout(set = 0, binding = 0) buffer tensorLhs {float valuesLhs[];};
layout(set = 0, binding = 1) buffer tensorRhs {float valuesRhs[];};
layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[];};
layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

void main()
{
    uint index = gl_GlobalInvocationID.x;
    valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
}
    """
    compiled = kp.Shader.compile_source(glsl_source)

    manager = kp.Manager()

    # Binding order follows the position in this list (lhs=0, rhs=1, out=2).
    lhs = manager.tensor([2, 2, 2])
    rhs = manager.tensor([1, 2, 3])
    product = manager.tensor([0, 0, 0])
    bindings = [lhs, rhs, product]

    algorithm = manager.algorithm(bindings, compiled)

    # Upload all tensors, run the shader, then download all tensors.
    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice(bindings))
    sequence.record(kp.OpAlgoDispatch(algorithm))
    sequence.record(kp.OpTensorSyncLocal(bindings))
    sequence.eval()

    assert product.data().tolist() == [2.0, 4.0, 6.0]
Exemplo n.º 2
0
def test_type_float_double_incorrect():
    """
    Check that a mistyped input tensor does not yield the numeric product.

    The shader declares all three buffers as ``float``, but the second
    input is created from a ``uint32`` array. The test asserts that no
    output element equals the NumPy product of the two input arrays.
    """
    glsl_source = """
        #version 450
        layout(set = 0, binding = 0) buffer tensorLhs {float valuesLhs[];};
        layout(set = 0, binding = 1) buffer tensorRhs {float valuesRhs[];};
        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[];};
        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

        void main()
        {
            uint index = gl_GlobalInvocationID.x;
            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
        }
    """
    compiled = compile_source(glsl_source)

    lhs_values = np.array([123., 153., 231.], dtype=np.float32)
    # Deliberately uint32 although the shader reads this buffer as float.
    rhs_values = np.array([9482, 1208, 1238], dtype=np.uint32)
    out_values = np.array([0, 0, 0], dtype=np.float32)

    manager = kp.Manager()

    lhs = manager.tensor_t(lhs_values)
    rhs = manager.tensor_t(rhs_values)
    result = manager.tensor_t(out_values)
    bindings = [lhs, rhs, result]

    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice(bindings))
    sequence.record(kp.OpAlgoDispatch(manager.algorithm(bindings, compiled)))
    sequence.record(kp.OpTensorSyncLocal([result]))
    sequence.eval()

    # Every element must differ from the numerically correct product.
    assert not np.any(result.data() == lhs_values * rhs_values)
Exemplo n.º 3
0
def test_pushconsts():
    """
    Accumulate float push constants into a tensor over two dispatches.

    The shader adds the three push-constant fields to the first three
    elements of the bound buffer. It is dispatched twice with different
    push constants and the accumulated result is checked.
    """
    glsl_source = """
          #version 450
          layout(push_constant) uniform PushConstants {
            float x;
            float y;
            float z;
          } pcs;
          layout (local_size_x = 1) in;
          layout(set = 0, binding = 0) buffer a { float pa[]; };
          void main() {
              pa[0] += pcs.x;
              pa[1] += pcs.y;
              pa[2] += pcs.z;
          }
    """
    compiled = kp.Shader.compile_source(glsl_source)

    manager = kp.Manager()

    accumulator = manager.tensor([0, 0, 0])

    algorithm = manager.algorithm([accumulator], compiled, (1, 1, 1))

    # Two dispatches of the same algorithm, each with its own push constants.
    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice([accumulator]))
    sequence.record(kp.OpAlgoDispatch(algorithm, [0.1, 0.2, 0.3]))
    sequence.record(kp.OpAlgoDispatch(algorithm, [0.3, 0.2, 0.1]))
    sequence.record(kp.OpTensorSyncLocal([accumulator]))
    sequence.eval()

    expected = np.array([0.4, 0.4, 0.4], dtype=np.float32)
    assert np.all(accumulator.data() == expected)
Exemplo n.º 4
0
def test_workgroup():
    """
    Verify invocation ids written by a shader dispatched on a 16x8 workgroup.

    Each invocation computes a flat index from its workgroup id and writes
    its global x id into the first buffer and its global y id into the
    second. The buffers are then compared with the expected id patterns.
    """
    manager = kp.Manager(0)

    out_x = manager.tensor(np.zeros([16, 8]))
    out_y = manager.tensor(np.zeros([16, 8]))

    # The shader body is translated by pyshader, so it is kept as-is.
    @ps.python2shader
    def compute_shader_wg(gl_idx=("input", "GlobalInvocationId", ps.ivec3),
                          gl_wg_id=("input", "WorkgroupId", ps.ivec3),
                          gl_wg_num=("input", "NumWorkgroups", ps.ivec3),
                          data1=("buffer", 0, ps.Array(ps.f32)),
                          data2=("buffer", 1, ps.Array(ps.f32))):
        i = gl_wg_id.x * gl_wg_num.y + gl_wg_id.y
        data1[i] = f32(gl_idx.x)
        data2[i] = f32(gl_idx.y)

    outputs = [out_x, out_y]
    algorithm = manager.algorithm(outputs, compute_shader_wg.to_spirv(),
                                  (16, 8, 1))

    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice([out_x, out_y]))
    sequence.record(kp.OpAlgoDispatch(algorithm))
    sequence.record(kp.OpTensorSyncLocal([out_x, out_y]))
    sequence.eval()

    print(out_x.data())
    print(out_y.data())

    # Row-major flattening: x ids repeat within a row, y ids cycle per row.
    x_ids = np.arange(16)
    y_ids = np.arange(8)
    expected_x = np.stack([x_ids] * 8, axis=1).ravel()
    expected_y = np.stack([y_ids] * 16, axis=0).ravel()

    assert np.all(out_x.data() == expected_x)
    assert np.all(out_y.data() == expected_y)
Exemplo n.º 5
0
def test_array_multiplication():
    """
    Multiply two tensors element-wise using a pyshader-defined shader.

    Only the output tensor is synced back to the host before the result is
    checked, both as a Python list and through NumPy comparison.
    """
    # 1. Create the Kompute manager with its default arguments.
    manager = kp.Manager()

    # 2. Create the tensors that hold the inputs and the output.
    lhs = manager.tensor(np.array([2, 2, 2]))
    rhs = manager.tensor(np.array([1, 2, 3]))
    product = manager.tensor(np.array([0, 0, 0]))

    bindings = [lhs, rhs, product]

    # 3. Define the multiplication shader. The body is translated by
    #    pyshader, so it is kept as-is.
    @ps.python2shader
    def compute_mult(index=("input", "GlobalInvocationId", ps.ivec3),
                     data1=("buffer", 0, ps.Array(ps.f32)),
                     data2=("buffer", 1, ps.Array(ps.f32)),
                     data3=("buffer", 2, ps.Array(ps.f32))):
        i = index.x
        data3[i] = data1[i] * data2[i]

    # 4. Upload everything, dispatch, and download only the output.
    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice(bindings))
    algorithm = manager.algorithm(bindings, compute_mult.to_spirv())
    sequence.record(kp.OpAlgoDispatch(algorithm))
    sequence.record(kp.OpTensorSyncLocal([product]))
    sequence.eval()

    expected = [2.0, 4.0, 6.0]
    assert product.data().tolist() == expected
    assert np.all(product.data() == expected)
Exemplo n.º 6
0
    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor,
                 tensor_in_2: kp.Tensor, tensor_out: kp.Tensor):
        """
        Dispatch this operator's shader on the given tensors.

        The algorithm is (re)built whenever none exists yet, or when the
        shape or the tensor list differs from the previous call. The local
        sizes are clamped to the tensor dimensions and the workgroup counts
        use integer division, so any remainder is dropped.

        Args:
            tensor_shape: The two dimensions of the tensors.
            tensor_in_1: First input tensor (binding 0).
            tensor_in_2: Second input tensor (binding 1).
            tensor_out: Output tensor (binding 2); synced back to the host.
        """
        params = [tensor_in_1, tensor_in_2, tensor_out]

        # Short-circuit order matters: the cached fields are only compared
        # once an algorithm is known to exist.
        needs_rebuild = (self.algo is None
                         or self.tensor_shape != tensor_shape
                         or self.params != params)
        if needs_rebuild:
            self.tensor_shape = tensor_shape
            self.params = params

            local_size_x = min(self.local_size_x, tensor_shape[0])
            local_size_y = min(self.local_size_y, tensor_shape[1])

            shader_source = self.shader.format(local_size_x=local_size_x,
                                               local_size_y=local_size_y)
            self.compiled_shader = kp.Shader.compile_source(shader_source)

            workgroup = (tensor_shape[0] // local_size_x,
                         tensor_shape[1] // local_size_y,
                         1)
            print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')

            spec_consts = [float(tensor_shape[0])]
            push_consts = []
            self.algo = self.mgr.algorithm(params, self.compiled_shader,
                                           workgroup, spec_consts,
                                           push_consts)

        # Upload the inputs, run the shader, download only the output.
        sequence = self.mgr.sequence()
        sequence.record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
        sequence.record(kp.OpAlgoDispatch(self.algo))
        sequence.record(kp.OpTensorSyncLocal([tensor_out]))
        sequence.eval()
Exemplo n.º 7
0
def test_end_to_end():
    """
    Exercise tensors, spec constants, push constants and async evaluation.

    The shader accumulates the product of the two inputs into the first
    output and ``const_one * push_const.val`` into the second. It is
    dispatched twice, first with the algorithm's initial push constants and
    then with an override, and the results are read back through an
    asynchronous sync.
    """
    manager = kp.Manager()

    in_a = manager.tensor([2, 2, 2])
    in_b = manager.tensor([1, 2, 3])
    # The outputs are created from uint32 arrays, matching the shader's
    # uint output buffers.
    out_a = manager.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
    out_b = manager.tensor_t(np.array([0, 0, 0], dtype=np.uint32))

    bindings = [in_a, in_b, out_a, out_b]

    glsl_source = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
            float val;
        } push_const;

        // Kompute also supports spec constants on initalization
        layout(constant_id = 0) const float const_one = 0;

        void main() {
            uint index = gl_GlobalInvocationID.x;
            out_a[index] += uint( in_a[index] * in_b[index] );
            out_b[index] += uint( const_one * push_const.val );
        }
    """

    workgroup = (3, 1, 1)
    spec_consts = [2]
    initial_push_consts = [2]
    override_push_consts = [3]

    algorithm = manager.algorithm(bindings, compile_source(glsl_source),
                                  workgroup, spec_consts, initial_push_consts)

    # First dispatch uses the initial push constants, second the override.
    compute_seq = manager.sequence()
    compute_seq.record(kp.OpTensorSyncDevice(bindings))
    compute_seq.record(kp.OpAlgoDispatch(algorithm))
    compute_seq.record(kp.OpAlgoDispatch(algorithm, override_push_consts))
    compute_seq.eval()

    # Download the tensors asynchronously and wait for completion.
    readback_seq = manager.sequence()
    readback_seq.eval_async(kp.OpTensorSyncLocal(bindings))
    readback_seq.eval_await()

    assert out_a.data().tolist() == [4, 8, 12]
    assert out_b.data().tolist() == [10, 10, 10]
Exemplo n.º 8
0
def main():
    """
    Run one 4x4 matrix-multiplication shader and print inputs and output.

    Both inputs are upper-triangular matrices of ones. The tensor size is
    passed to the shader as a float specialization constant, and one
    workgroup is dispatched per output element.
    """
    mgr = kp.Manager()

    tensor_size = 4
    tensor_shape = [tensor_size, tensor_size]
    upper_ones = np.triu(np.ones(tensor_shape))
    tensor_in_1 = mgr.tensor(upper_ones)
    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
    tensor_out = mgr.tensor(np.zeros(tensor_shape))

    print(f'Input tensors:\n'
          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
          f'{tensor_in_2.data().reshape(tensor_shape)}\n')

    bindings = [tensor_in_1, tensor_in_2, tensor_out]

    glsl_source = '''
#version 450

layout (local_size_x = 1, local_size_y = 1) in;

layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };

layout (constant_id = 0) const float tensor_size_f = 0;


void main()
{
    uint globalRow = gl_GlobalInvocationID.x;
    uint globalCol = gl_GlobalInvocationID.y;
    uint tensor_size = uint(tensor_size_f);
    float acc = 0.0;
    for(uint k = 0u; k < tensor_size; k++)
        acc += in_tensor_1[(k * tensor_size) + globalRow] * in_tensor_2[(globalCol * tensor_size) + k];
    out_tensor[(globalCol * tensor_size) + globalRow] = acc;
}'''
    compiled = kp.Shader.compile_source(glsl_source)

    workgroup = (tensor_shape[0], tensor_shape[1], 1)
    spec_consts = [float(tensor_size)]
    push_consts = []
    algorithm = mgr.algorithm(bindings, compiled, workgroup, spec_consts,
                              push_consts)

    sequence = mgr.sequence()
    sequence.record(kp.OpTensorSyncDevice(bindings))
    sequence.record(kp.OpAlgoDispatch(algorithm))
    sequence.record(kp.OpTensorSyncLocal(bindings))
    sequence.eval()

    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')
Exemplo n.º 9
0
def main():
    """
    Benchmark the MatMulOp1, MatMulOp2 and MatMulOp3 operators.

    For each ``(tensor_size, experiment_count)`` pair, two upper-triangular
    matrices of ones are multiplied. The reference result comes from NumPy
    for sizes up to 512 and from ``MatMulOp1`` for larger sizes. Each
    operator is called once before timing starts, then timed over
    ``experiment_count`` calls. Its output is compared element-wise with
    the reference and either the throughput or a failure message is
    printed.
    """
    mgr = kp.Manager()
    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
        tensor_shape = [tensor_size, tensor_size]
        mat_1 = np.triu(np.ones(tensor_shape))
        mat_2 = np.triu(np.ones(tensor_shape))

        tensor_in_1 = mgr.tensor(mat_1)
        tensor_in_2 = mgr.tensor(mat_2)
        tensor_out = mgr.tensor(np.zeros(tensor_shape))
        if tensor_size <= 512:
            mat_result = mat_1 @ mat_2
        else:
            # CPU is too slow for big sizes, so use a GPU result as reference.
            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            mat_result = tensor_out.data().reshape(tensor_shape)

        print(f'{tensor_shape} input tensors:\n' f'{mat_1}\n' f'{mat_2}\n')
        print(f'Output :\n{mat_result}')

        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
            # Clear the previous operator's result on the host AND on the
            # device. record() only queues the operation; without eval()
            # the zeroed buffer is never uploaded.
            tensor_out.data()[:] = 0
            mgr.sequence().record(
                kp.OpTensorSyncDevice([tensor_out])).eval()

            matmul_op = MatMulOp(mgr)
            # Untimed first call, kept outside the measured loop.
            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

            # perf_counter is monotonic and high-resolution, unlike the
            # wall clock, so it is the right timer for a benchmark.
            start_time = time.perf_counter()
            for _ in range(experiment_count):
                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            end_time = time.perf_counter()
            experiment_time = end_time - start_time

            # One multiply per k plus (k - 1) additions for every output
            # element of the product.
            op_count = tensor_shape[0] * tensor_shape[1] * (
                (tensor_shape[1] * 2) - 1)

            if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
                print(
                    f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                    f'{experiment_time * 1000:0.2f}ms => '
                    f'{experiment_count / experiment_time:0.2f}op/s or '
                    f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS'
                )
            else:
                print(
                    f'Test failed => output tensor is wrong :\n{tensor_out.data().reshape(tensor_shape)}'
                )
Exemplo n.º 10
0
    def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor,
                 tensor_in_2: kp.Tensor, tensor_out: kp.Tensor):
        """
        Dispatch this operator's tiled shader on the given tensors.

        The algorithm is (re)built whenever none exists yet, or when the
        shape or the tensor list differs from the previous call. The tile
        size is clamped to the smaller tensor dimension and the workgroup
        counts use integer division, so any remainder is dropped.

        Args:
            tensor_shape: The two dimensions of the tensors.
            tensor_in_1: First input tensor (binding 0).
            tensor_in_2: Second input tensor (binding 1).
            tensor_out: Output tensor (binding 2); synced back to the host.
        """
        params = [tensor_in_1, tensor_in_2, tensor_out]

        # Short-circuit order matters: the cached fields are only compared
        # once an algorithm is known to exist.
        needs_rebuild = (self.algo is None
                         or self.tensor_shape != tensor_shape
                         or self.params != params)
        if needs_rebuild:
            self.tensor_shape = tensor_shape
            self.params = params

            tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size)
            shader_source = self.shader.format(tile_size=tile_size)
            self.compiled_shader = kp.Shader.compile_source(shader_source)

            tiles_x = tensor_shape[0] // tile_size
            tiles_y = tensor_shape[1] // tile_size
            workgroup = [tiles_x, tiles_y, 1]

            spec_consts = [float(tensor_shape[0])]
            push_consts = []
            self.algo = self.mgr.algorithm(params, self.compiled_shader,
                                           workgroup, spec_consts,
                                           push_consts)

        # Upload the inputs, run the shader, download only the output.
        sequence = self.mgr.sequence()
        sequence.record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
        sequence.record(kp.OpAlgoDispatch(self.algo))
        sequence.record(kp.OpTensorSyncLocal([tensor_out]))
        sequence.eval()
Exemplo n.º 11
0
def test_pushconsts_int():
    """
    Accumulate int32 push constants into a tensor over three dispatches.

    The first dispatch uses the push constants given when the algorithm is
    created; the next two pass their own. Each adds -1 to every element,
    so the expected result is -3 throughout.
    """
    glsl_source = """
          #version 450
          layout(push_constant) uniform PushConstants {
            int x;
            int  y;
            int  z;
          } pcs;
          layout (local_size_x = 1) in;
          layout(set = 0, binding = 0) buffer a { int  pa[]; };
          void main() {
              pa[0] += pcs.x;
              pa[1] += pcs.y;
              pa[2] += pcs.z;
          }
    """
    compiled = compile_source(glsl_source)

    manager = kp.Manager()

    accumulator = manager.tensor_t(np.array([0, 0, 0], dtype=np.int32))

    no_spec_consts = np.array([], dtype=np.int32)
    initial_push = np.array([-1, -1, -1], dtype=np.int32)

    algorithm = manager.algorithm([accumulator], compiled, (1, 1, 1),
                                  no_spec_consts, initial_push)

    sequence = manager.sequence()
    sequence.record(kp.OpTensorSyncDevice([accumulator]))
    # Uses the push constants supplied at algorithm creation.
    sequence.record(kp.OpAlgoDispatch(algorithm))
    # Each of these passes a freshly built push-constant array.
    second_push = np.array([-1, -1, -1], dtype=np.int32)
    sequence.record(kp.OpAlgoDispatch(algorithm, second_push))
    third_push = np.array([-1, -1, -1], dtype=np.int32)
    sequence.record(kp.OpAlgoDispatch(algorithm, third_push))
    sequence.record(kp.OpTensorSyncLocal([accumulator]))
    sequence.eval()

    expected = np.array([-3, -3, -3], dtype=np.int32)
    assert np.all(accumulator.data() == expected)
def test_logistic_regression():
    """
    Train a two-feature logistic regression with a GPU gradient shader.

    The shader computes, per sample, the gradient contributions for the two
    weights and the bias, plus the binary cross-entropy loss. The host then
    applies those contributions with a fixed learning rate for a fixed
    number of iterations, and the learned parameters are checked against
    expected bounds.
    """
    # Per-sample forward pass and gradients. The body is translated by
    # pyshader, so names such as vec2/exp/log are resolved by that
    # translation rather than by Python.
    @ps.python2shader
    def compute_shader(index=("input", "GlobalInvocationId", ps.ivec3),
                       x_i=("buffer", 0, ps.Array(ps.f32)),
                       x_j=("buffer", 1, ps.Array(ps.f32)),
                       y=("buffer", 2, ps.Array(ps.f32)),
                       w_in=("buffer", 3, ps.Array(ps.f32)),
                       w_out_i=("buffer", 4, ps.Array(ps.f32)),
                       w_out_j=("buffer", 5, ps.Array(ps.f32)),
                       b_in=("buffer", 6, ps.Array(ps.f32)),
                       b_out=("buffer", 7, ps.Array(ps.f32)),
                       l_out=("buffer", 8, ps.Array(ps.f32)),
                       M=("buffer", 9, ps.Array(ps.f32))):

        i = index.x

        m = M[0]

        w_curr = vec2(w_in[0], w_in[1])
        b_curr = b_in[0]

        x_curr = vec2(x_i[i], x_j[i])
        y_curr = y[i]

        z_dot = w_curr @ x_curr
        z = z_dot + b_curr
        y_hat = 1.0 / (1.0 + exp(-z))

        d_z = y_hat - y_curr
        d_w = (1.0 / m) * x_curr * d_z
        d_b = (1.0 / m) * d_z

        # Binary cross-entropy: the second term is weighted by (1 - y),
        # not (1 + y).
        loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))

        w_out_i[i] = d_w.x
        w_out_j[i] = d_w.y
        b_out[i] = d_b
        l_out[i] = loss

    mgr = kp.Manager(0)

    # First we create input and output tensors for the shader
    tensor_x_i = mgr.tensor(np.array([0.0, 1.0, 1.0, 1.0, 1.0]))
    tensor_x_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_y = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_w_in = mgr.tensor(np.array([0.001, 0.001]))
    tensor_w_out_i = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
    tensor_w_out_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_b_in = mgr.tensor(np.array([0.0]))
    tensor_b_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_l_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    # Sample count, read by the shader as M[0] to scale the gradients
    tensor_m = mgr.tensor(np.array([tensor_y.size()]))

    # We store them in an array for easier interaction; the order must
    # match the buffer binding indices declared in the shader
    params = [
        tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m
    ]

    # One-off upload of every tensor before training starts
    mgr.sequence().eval(kp.OpTensorSyncDevice(params))

    # Create a managed sequence that is re-evaluated on every iteration
    sq = mgr.sequence()

    # Record operation to sync the updated parameters from local to GPU memory
    sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))

    # Record operation to execute GPU shader against all our parameters
    sq.record(
        kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))

    # Record operation to sync memory from GPU to local memory
    sq.record(
        kp.OpTensorSyncLocal(
            [tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))

    ITERATIONS = 100
    learning_rate = 0.1

    # Perform machine learning training and inference across all input X and Y
    for i_iter in range(ITERATIONS):

        # Execute an iteration of the algorithm
        sq.eval()

        # Apply each sample's gradient contribution to the parameters;
        # data() is modified in place so the next eval uploads the update
        for j_iter in range(tensor_b_out.size()):
            tensor_w_in.data()[0] -= (
                learning_rate * tensor_w_out_i.data()[j_iter])
            tensor_w_in.data()[1] -= (
                learning_rate * tensor_w_out_j.data()[j_iter])
            tensor_b_in.data()[0] -= (
                learning_rate * tensor_b_out.data()[j_iter])

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
    assert tensor_w_in.data()[1] > 1.5
    assert tensor_b_in.data()[0] < 0.7
Exemplo n.º 13
0
            tensor_out_cg
        ],
        # push_consts
        [])

    print("Step complexity " + str(workgroup))
    print("Step channel layout " + str(tensor_in_cg) + " " +
          str(tensor_out_cg))

    # Do this first. Keep in mind "syncs" are copies.
    last_seq = kpm.sequence()
    things_to_sync_to_device = [bias, weight]
    if i == 0:
        # For first layer, the input isn't on-device yet
        things_to_sync_to_device.append(tensor_in)
    last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
    last_seq.eval_await()

    # Prepare
    seq = (kpm.sequence().record(kp.OpAlgoDispatch(alg, [])))
    # Run
    seq.eval()

    print("Done with step")

    if False:
        # DEBUG:
        # We want to see the output, copy it to local
        last_seq = kpm.sequence()
        last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
        last_seq.eval_await()