def test_shader_str():
    """
    Test basic OpAlgoBase operation
    """

    shader = """
        #version 450

        layout(set = 0, binding = 0) buffer tensorLhs { float valuesLhs[]; };
        layout(set = 0, binding = 1) buffer tensorRhs { float valuesRhs[]; };
        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[]; };

        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

        void main()
        {
            uint index = gl_GlobalInvocationID.x;
            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
        }
    """

    spirv = kp.Shader.compile_source(shader)

    mgr = kp.Manager()

    tensor_in_a = mgr.tensor([2, 2, 2])
    tensor_in_b = mgr.tensor([1, 2, 3])
    tensor_out = mgr.tensor([0, 0, 0])

    params = [tensor_in_a, tensor_in_b, tensor_out]

    algo = mgr.algorithm(params, spirv)

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(algo))
        .record(kp.OpTensorSyncLocal(params))
        .eval())

    assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]

def test_type_float_double_incorrect():

    shader = """
        #version 450

        layout(set = 0, binding = 0) buffer tensorLhs { float valuesLhs[]; };
        layout(set = 0, binding = 1) buffer tensorRhs { float valuesRhs[]; };
        layout(set = 0, binding = 2) buffer tensorOutput { float valuesOutput[]; };

        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

        void main()
        {
            uint index = gl_GlobalInvocationID.x;
            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
        }
    """

    spirv = compile_source(shader)

    arr_in_a = np.array([123., 153., 231.], dtype=np.float32)
    arr_in_b = np.array([9482, 1208, 1238], dtype=np.uint32)
    arr_out = np.array([0, 0, 0], dtype=np.float32)

    mgr = kp.Manager()

    tensor_in_a = mgr.tensor_t(arr_in_a)
    tensor_in_b = mgr.tensor_t(arr_in_b)
    tensor_out = mgr.tensor_t(arr_out)

    params = [tensor_in_a, tensor_in_b, tensor_out]

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(mgr.algorithm(params, spirv)))
        .record(kp.OpTensorSyncLocal([tensor_out]))
        .eval())

    assert np.all(tensor_out.data() != arr_in_a * arr_in_b)

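# A sketch of why the assert above holds (not part of the test): binding 1 is
# declared as float in the shader, so the uint32 payload is reinterpreted
# bit-for-bit rather than converted, and the products no longer match
# arr_in_a * arr_in_b. np.ndarray.view shows the same reinterpretation on the CPU.
as_seen_by_shader = np.array([9482, 1208, 1238], dtype=np.uint32).view(np.float32)
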
def test_pushconsts():

    spirv = kp.Shader.compile_source("""
        #version 450

        layout(push_constant) uniform PushConstants {
            float x;
            float y;
            float z;
        } pcs;

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer a { float pa[]; };

        void main() {
            pa[0] += pcs.x;
            pa[1] += pcs.y;
            pa[2] += pcs.z;
        }
    """)

    mgr = kp.Manager()

    tensor = mgr.tensor([0, 0, 0])

    algo = mgr.algorithm([tensor], spirv, (1, 1, 1))

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice([tensor]))
        .record(kp.OpAlgoDispatch(algo, [0.1, 0.2, 0.3]))
        .record(kp.OpAlgoDispatch(algo, [0.3, 0.2, 0.1]))
        .record(kp.OpTensorSyncLocal([tensor]))
        .eval())

    assert np.all(tensor.data() == np.array([0.4, 0.4, 0.4], dtype=np.float32))

def test_workgroup():
    mgr = kp.Manager(0)

    tensor_a = mgr.tensor(np.zeros([16, 8]))
    tensor_b = mgr.tensor(np.zeros([16, 8]))

    @ps.python2shader
    def compute_shader_wg(gl_idx=("input", "GlobalInvocationId", ps.ivec3),
                          gl_wg_id=("input", "WorkgroupId", ps.ivec3),
                          gl_wg_num=("input", "NumWorkgroups", ps.ivec3),
                          data1=("buffer", 0, ps.Array(ps.f32)),
                          data2=("buffer", 1, ps.Array(ps.f32))):
        i = gl_wg_id.x * gl_wg_num.y + gl_wg_id.y
        data1[i] = f32(gl_idx.x)
        data2[i] = f32(gl_idx.y)

    algo = mgr.algorithm([tensor_a, tensor_b], compute_shader_wg.to_spirv(),
                         (16, 8, 1))

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice([tensor_a, tensor_b]))
        .record(kp.OpAlgoDispatch(algo))
        .record(kp.OpTensorSyncLocal([tensor_a, tensor_b]))
        .eval())

    print(tensor_a.data())
    print(tensor_b.data())

    assert np.all(tensor_a.data() == np.stack([np.arange(16)] * 8, axis=1).ravel())
    assert np.all(tensor_b.data() == np.stack([np.arange(8)] * 16, axis=0).ravel())

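# For reference (a sketch, not part of the test): with a (16, 8, 1) dispatch of
# 1x1x1 workgroups, the shader writes to i = x * 8 + y, so data1 holds the x
# index and data2 the y index at every position, which is what the assertions
# above expect.
expected_a = np.repeat(np.arange(16), 8)   # same values as np.stack([np.arange(16)] * 8, axis=1).ravel()
expected_b = np.tile(np.arange(8), 16)     # same values as np.stack([np.arange(8)] * 16, axis=0).ravel()
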
def test_array_multiplication():
    # 1. Create Kompute Manager (selects device 0 by default)
    mgr = kp.Manager()

    # 2. Create Kompute Tensors to hold data
    tensor_in_a = mgr.tensor(np.array([2, 2, 2]))
    tensor_in_b = mgr.tensor(np.array([1, 2, 3]))
    tensor_out = mgr.tensor(np.array([0, 0, 0]))

    params = [tensor_in_a, tensor_in_b, tensor_out]

    # 4. Define the multiplication shader code to run on the GPU
    @ps.python2shader
    def compute_mult(index=("input", "GlobalInvocationId", ps.ivec3),
                     data1=("buffer", 0, ps.Array(ps.f32)),
                     data2=("buffer", 1, ps.Array(ps.f32)),
                     data3=("buffer", 2, ps.Array(ps.f32))):
        i = index.x
        data3[i] = data1[i] * data2[i]

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_mult.to_spirv())))
        .record(kp.OpTensorSyncLocal([tensor_out]))
        .eval())

    assert tensor_out.data().tolist() == [2.0, 4.0, 6.0]
    assert np.all(tensor_out.data() == [2.0, 4.0, 6.0])

def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor,
             tensor_in_2: kp.Tensor, tensor_out: kp.Tensor):
    params = [tensor_in_1, tensor_in_2, tensor_out]
    if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
        self.tensor_shape = tensor_shape
        self.params = params

        local_size_x = min(self.local_size_x, tensor_shape[0])
        local_size_y = min(self.local_size_y, tensor_shape[1])
        self.compiled_shader = kp.Shader.compile_source(
            self.shader.format(local_size_x=local_size_x,
                               local_size_y=local_size_y))

        workgroup = (tensor_shape[0] // local_size_x,
                     tensor_shape[1] // local_size_y, 1)
        print(f'{workgroup=} {self.local_size_x=} {self.local_size_y=}')

        self.algo = self.mgr.algorithm(
            params,                    # params
            self.compiled_shader,      # spirv
            workgroup,                 # workgroup
            [float(tensor_shape[0])],  # spec_consts
            [])                        # push_consts

    (self.mgr.sequence()
        .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
        .record(kp.OpAlgoDispatch(self.algo))
        .record(kp.OpTensorSyncLocal([tensor_out]))
        .eval())

def test_end_to_end():

    mgr = kp.Manager()

    tensor_in_a = mgr.tensor([2, 2, 2])
    tensor_in_b = mgr.tensor([1, 2, 3])
    # Explicit type constructor supports uint32, int32, double, float and bool
    tensor_out_a = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))
    tensor_out_b = mgr.tensor_t(np.array([0, 0, 0], dtype=np.uint32))

    params = [tensor_in_a, tensor_in_b, tensor_out_a, tensor_out_b]

    shader = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) buffer buf_in_a { float in_a[]; };
        layout(set = 0, binding = 1) buffer buf_in_b { float in_b[]; };
        layout(set = 0, binding = 2) buffer buf_out_a { uint out_a[]; };
        layout(set = 0, binding = 3) buffer buf_out_b { uint out_b[]; };

        // Kompute supports push constants updated on dispatch
        layout(push_constant) uniform PushConstants {
            float val;
        } push_const;

        // Kompute also supports spec constants on initialization
        layout(constant_id = 0) const float const_one = 0;

        void main() {
            uint index = gl_GlobalInvocationID.x;
            out_a[index] += uint( in_a[index] * in_b[index] );
            out_b[index] += uint( const_one * push_const.val );
        }
    """

    workgroup = (3, 1, 1)
    spec_consts = [2]
    push_consts_a = [2]
    push_consts_b = [3]

    algo = mgr.algorithm(params, compile_source(shader), workgroup, spec_consts,
                         push_consts_a)

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(algo))
        .record(kp.OpAlgoDispatch(algo, push_consts_b))
        .eval())

    sq = mgr.sequence()
    sq.eval_async(kp.OpTensorSyncLocal(params))
    sq.eval_await()

    assert tensor_out_a.data().tolist() == [4, 8, 12]
    assert tensor_out_b.data().tolist() == [10, 10, 10]

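# Worked arithmetic behind the asserts above (a sketch using the test's values):
# both dispatches accumulate into out_a and out_b.
# in_a * in_b                -> [2, 4, 6], added twice              -> [4, 8, 12]
# const_one * push_const.val -> 2*2 = 4 (first dispatch) + 2*3 = 6  -> 10 per element
expected_out_a = [2 * 1 * 2, 2 * 2 * 2, 2 * 3 * 2]   # [4, 8, 12]
expected_out_b = [2 * 2 + 2 * 3] * 3                 # [10, 10, 10]
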
def main():
    mgr = kp.Manager()

    tensor_size = 4
    tensor_shape = [tensor_size, tensor_size]
    tensor_in_1 = mgr.tensor(np.triu(np.ones(tensor_shape)))
    tensor_in_2 = mgr.tensor(np.triu(np.ones(tensor_shape)))
    tensor_out = mgr.tensor(np.zeros(tensor_shape))

    print(f'Input tensors:\n'
          f'{tensor_in_1.data().reshape(tensor_shape)}\n'
          f'{tensor_in_2.data().reshape(tensor_shape)}\n')

    params = [tensor_in_1, tensor_in_2, tensor_out]

    matmul_shader = kp.Shader.compile_source('''
        #version 450

        layout (local_size_x = 1, local_size_y = 1) in;

        layout (set = 0, binding = 0) readonly buffer buf_in_tensor_1 { float in_tensor_1[]; };
        layout (set = 0, binding = 1) readonly buffer buf_in_tensor_2 { float in_tensor_2[]; };
        layout (set = 0, binding = 2) writeonly buffer buf_out_tensor { float out_tensor[]; };

        layout (constant_id = 0) const float tensor_size_f = 0;

        void main()
        {
            uint globalRow = gl_GlobalInvocationID.x;
            uint globalCol = gl_GlobalInvocationID.y;

            uint tensor_size = uint(tensor_size_f);

            float acc = 0.0;
            for (uint k = 0u; k < tensor_size; k++)
                acc += in_tensor_1[(k * tensor_size) + globalRow]
                     * in_tensor_2[(globalCol * tensor_size) + k];

            out_tensor[(globalCol * tensor_size) + globalRow] = acc;
        }''')

    algo = mgr.algorithm(
        params,                   # params
        matmul_shader,            # spirv
        (*tensor_shape, 1),       # workgroup
        [float(tensor_size)],     # spec_consts
        [])                       # push_consts

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice(params))
        .record(kp.OpAlgoDispatch(algo))
        .record(kp.OpTensorSyncLocal(params))
        .eval())

    print(f'Output :\n{tensor_out.data().reshape(tensor_shape)}')

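# Index bookkeeping for the shader above (a sketch, not part of the example):
# tensor.data() is a row-major flattening, so in_tensor_1[k * N + r] is M1[k, r]
# and out_tensor[c * N + r] is OUT[c, r]. The shader therefore computes
# OUT[c, r] = sum_k M1[k, r] * M2[c, k], i.e. OUT = M2 @ M1, which equals
# M1 @ M2 here because both inputs are the same upper-triangular matrix.
expected_output = np.triu(np.ones([4, 4])) @ np.triu(np.ones([4, 4]))
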
def main():
    mgr = kp.Manager()

    for tensor_size, experiment_count in [(512, 1000), (4096, 5)]:
        tensor_shape = [tensor_size, tensor_size]
        mat_1 = np.triu(np.ones(tensor_shape))
        mat_2 = np.triu(np.ones(tensor_shape))

        tensor_in_1 = mgr.tensor(mat_1)
        tensor_in_2 = mgr.tensor(mat_2)
        tensor_out = mgr.tensor(np.zeros(tensor_shape))

        if tensor_size <= 512:
            mat_result = mat_1 @ mat_2
        else:
            # CPU is too slow for big sizes
            MatMulOp1(mgr)(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            mat_result = tensor_out.data().reshape(tensor_shape)

        print(f'{tensor_shape} input tensors:\n'
              f'{mat_1}\n'
              f'{mat_2}\n')
        print(f'Output :\n{mat_result}')

        for MatMulOp in [MatMulOp1, MatMulOp2, MatMulOp3]:
            tensor_out.data()[:] = 0
            mgr.sequence().record(kp.OpTensorSyncDevice([tensor_out]))

            matmul_op = MatMulOp(mgr)
            matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)

            start_time = time.time()
            for _ in range(experiment_count):
                matmul_op(tensor_shape, tensor_in_1, tensor_in_2, tensor_out)
            end_time = time.time()
            experiment_time = end_time - start_time

            op_count = tensor_shape[0] * tensor_shape[1] * ((tensor_shape[1] * 2) - 1)

            # print(tensor_out.data().reshape(tensor_shape))
            if (tensor_out.data().reshape(tensor_shape) == mat_result).all():
                print(
                    f'From {MatMulOp.__module__} : {experiment_count} matmul time : '
                    f'{experiment_time * 1000:0.2f}ms => '
                    f'{experiment_count / experiment_time:0.2f}op/s or '
                    f'{experiment_count * op_count / (1e9 * experiment_time):0.2f} GFLOPS')
            else:
                print(f'Test failed => output tensor is wrong :\n'
                      f'{tensor_out.data().reshape(tensor_shape)}')

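# FLOP count used in the GFLOPS figure above (a sketch): each of the N*N outputs
# of an N x N matmul needs N multiplications and N - 1 additions, hence
# N * N * (2 * N - 1) floating-point operations per matmul.
flops_512 = 512 * 512 * (2 * 512 - 1)   # roughly 0.27e9 FLOPs per 512x512 matmul
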
def __call__(self, tensor_shape: tuple[int, int], tensor_in_1: kp.Tensor,
             tensor_in_2: kp.Tensor, tensor_out: kp.Tensor):
    params = [tensor_in_1, tensor_in_2, tensor_out]
    if self.algo is None or self.tensor_shape != tensor_shape or self.params != params:
        self.tensor_shape = tensor_shape
        self.params = params

        tile_size = min(tensor_shape[0], tensor_shape[1], self.tile_size)
        self.compiled_shader = kp.Shader.compile_source(
            self.shader.format(tile_size=tile_size))

        workgroup = [
            tensor_shape[0] // tile_size, tensor_shape[1] // tile_size, 1
        ]

        self.algo = self.mgr.algorithm(
            params,                    # params
            self.compiled_shader,      # spirv
            workgroup,                 # workgroup
            [float(tensor_shape[0])],  # spec_consts
            [])                        # push_consts

    (self.mgr.sequence()
        .record(kp.OpTensorSyncDevice([tensor_in_1, tensor_in_2]))
        .record(kp.OpAlgoDispatch(self.algo))
        .record(kp.OpTensorSyncLocal([tensor_out]))
        .eval())

def test_pushconsts_int():
    spirv = compile_source("""
        #version 450

        layout(push_constant) uniform PushConstants {
            int x;
            int y;
            int z;
        } pcs;

        layout (local_size_x = 1) in;

        layout(set = 0, binding = 0) buffer a { int pa[]; };

        void main() {
            pa[0] += pcs.x;
            pa[1] += pcs.y;
            pa[2] += pcs.z;
        }
    """)

    mgr = kp.Manager()

    tensor = mgr.tensor_t(np.array([0, 0, 0], dtype=np.int32))

    spec_consts = np.array([], dtype=np.int32)
    push_consts = np.array([-1, -1, -1], dtype=np.int32)

    algo = mgr.algorithm([tensor], spirv, (1, 1, 1), spec_consts, push_consts)

    (mgr.sequence()
        .record(kp.OpTensorSyncDevice([tensor]))
        .record(kp.OpAlgoDispatch(algo))
        .record(kp.OpAlgoDispatch(algo, np.array([-1, -1, -1], dtype=np.int32)))
        .record(kp.OpAlgoDispatch(algo, np.array([-1, -1, -1], dtype=np.int32)))
        .record(kp.OpTensorSyncLocal([tensor]))
        .eval())

    assert np.all(tensor.data() == np.array([-3, -3, -3], dtype=np.int32))

def test_logistic_regression():
    @ps.python2shader
    def compute_shader(index=("input", "GlobalInvocationId", ps.ivec3),
                       x_i=("buffer", 0, ps.Array(ps.f32)),
                       x_j=("buffer", 1, ps.Array(ps.f32)),
                       y=("buffer", 2, ps.Array(ps.f32)),
                       w_in=("buffer", 3, ps.Array(ps.f32)),
                       w_out_i=("buffer", 4, ps.Array(ps.f32)),
                       w_out_j=("buffer", 5, ps.Array(ps.f32)),
                       b_in=("buffer", 6, ps.Array(ps.f32)),
                       b_out=("buffer", 7, ps.Array(ps.f32)),
                       l_out=("buffer", 8, ps.Array(ps.f32)),
                       M=("buffer", 9, ps.Array(ps.f32))):
        i = index.x

        m = M[0]

        w_curr = vec2(w_in[0], w_in[1])
        b_curr = b_in[0]

        x_curr = vec2(x_i[i], x_j[i])
        y_curr = y[i]

        z_dot = w_curr @ x_curr
        z = z_dot + b_curr
        y_hat = 1.0 / (1.0 + exp(-z))

        d_z = y_hat - y_curr
        d_w = (1.0 / m) * x_curr * d_z
        d_b = (1.0 / m) * d_z

        loss = -((y_curr * log(y_hat)) + ((1.0 + y_curr) * log(1.0 - y_hat)))

        w_out_i[i] = d_w.x
        w_out_j[i] = d_w.y
        b_out[i] = d_b
        l_out[i] = loss

    mgr = kp.Manager(0)

    # First we create input and output tensors for the shader
    tensor_x_i = mgr.tensor(np.array([0.0, 1.0, 1.0, 1.0, 1.0]))
    tensor_x_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_y = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_w_in = mgr.tensor(np.array([0.001, 0.001]))
    tensor_w_out_i = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
    tensor_w_out_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_b_in = mgr.tensor(np.array([0.0]))
    tensor_b_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_l_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_m = mgr.tensor(np.array([tensor_y.size()]))

    # We store them in an array for easier interaction
    params = [
        tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m
    ]

    mgr.sequence().eval(kp.OpTensorSyncDevice(params))

    # Create a managed sequence
    sq = mgr.sequence()

    # Record operation to sync memory from local to GPU memory
    sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))

    # Record operation to execute GPU shader against all our parameters
    sq.record(kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))

    # Record operation to sync memory from GPU to local memory
    sq.record(kp.OpTensorSyncLocal(
        [tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))

    ITERATIONS = 100
    learning_rate = 0.1

    # Perform machine learning training and inference across all input X and Y
    for i_iter in range(ITERATIONS):
        # Execute an iteration of the algorithm
        sq.eval()

        # Calculate the parameters based on the respective derivatives calculated
        for j_iter in range(tensor_b_out.size()):
            tensor_w_in.data()[0] -= learning_rate * tensor_w_out_i.data()[j_iter]
            tensor_w_in.data()[1] -= learning_rate * tensor_w_out_j.data()[j_iter]
            tensor_b_in.data()[0] -= learning_rate * tensor_b_out.data()[j_iter]

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
    assert tensor_w_in.data()[1] > 1.5
    assert tensor_b_in.data()[0] < 0.7

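# NumPy reference for the per-sample gradients the shader computes (a sketch,
# assuming the same variable roles as in the test above; not part of the test):
def logistic_grads(x, y, w, b, m):
    z = w @ x + b                        # linear score
    y_hat = 1.0 / (1.0 + np.exp(-z))     # sigmoid
    d_z = y_hat - y
    d_w = (1.0 / m) * x * d_z            # matches w_out_i / w_out_j
    d_b = (1.0 / m) * d_z                # matches b_out
    return d_w, d_b
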
        tensor_out_cg],
    # push_consts
    [])

print("Step complexity " + str(workgroup))
print("Step channel layout " + str(tensor_in_cg) + " " + str(tensor_out_cg))

# Do this first. Keep in mind "syncs" are copies.
last_seq = kpm.sequence()
things_to_sync_to_device = [bias, weight]
if i == 0:
    # For first layer, the input isn't on-device yet
    things_to_sync_to_device.append(tensor_in)
last_seq.eval_async(kp.OpTensorSyncDevice(things_to_sync_to_device))
last_seq.eval_await()

# Prepare
seq = (kpm.sequence().record(kp.OpAlgoDispatch(alg, [])))

# Run
seq.eval()
print("Done with step")

if False:  # DEBUG:
    # We want to see the output, copy it to local
    last_seq = kpm.sequence()
    last_seq.eval_async(kp.OpTensorSyncLocal([tensor_out]))
    last_seq.eval_await()