def test_type_unsigned_int():
    """Multiply two uint32 tensors element-wise on the device.

    The result read back from the output tensor must equal the NumPy
    product of the two input arrays.
    """
    glsl_source = """
        #version 450
        layout(set = 0, binding = 0) buffer tensorLhs { uint valuesLhs[]; };
        layout(set = 0, binding = 1) buffer tensorRhs { uint valuesRhs[]; };
        layout(set = 0, binding = 2) buffer tensorOutput { uint valuesOutput[]; };
        layout (local_size_x = 1, local_size_y = 1, local_size_z = 1) in;

        void main()
        {
            uint index = gl_GlobalInvocationID.x;

            valuesOutput[index] = valuesLhs[index] * valuesRhs[index];
        }
    """
    spirv = compile_source(glsl_source)

    lhs = np.array([123, 153, 231], dtype=np.uint32)
    rhs = np.array([9482, 1208, 1238], dtype=np.uint32)
    zeros = np.array([0, 0, 0], dtype=np.uint32)

    mgr = kp.Manager()

    tensor_lhs = mgr.tensor_t(lhs)
    tensor_rhs = mgr.tensor_t(rhs)
    tensor_out = mgr.tensor_t(zeros)

    # Binding order must match the `binding = N` indices in the shader.
    bindings = [tensor_lhs, tensor_rhs, tensor_out]

    # Upload inputs, run the shader, then download only the output tensor.
    seq = mgr.sequence()
    seq.record(kp.OpTensorSyncDevice(bindings))
    seq.record(kp.OpAlgoDispatch(mgr.algorithm(bindings, spirv)))
    seq.record(kp.OpTensorSyncLocal([tensor_out]))
    seq.eval()

    print(f"Dtype value {tensor_out.data().dtype}")

    matches = tensor_out.data() == lhs * rhs
    assert np.all(matches)
def test_workgroup():
    """Dispatch a (16, 8, 1) workgroup grid and record each invocation's ids.

    Each workgroup writes its global x id into the first tensor and its
    global y id into the second, at the flattened index
    ``x * num_groups_y + y``.
    """
    mgr = kp.Manager(0)

    out_x = kp.Tensor(np.zeros([16, 8]))
    out_y = kp.Tensor(np.zeros([16, 8]))

    mgr.eval_tensor_create_def([out_x, out_y])

    glsl_text = """
        #version 450

        layout (local_size_x = 1) in;

        // The input tensors bind index is relative to index in parameter passed
        layout(set = 0, binding = 0) writeonly buffer bout { float toutx[]; };
        layout(set = 0, binding = 1) writeonly buffer bout2 { float touty[]; };

        void main() {
            uint index = gl_WorkGroupID.x*gl_NumWorkGroups.y + gl_WorkGroupID.y;

            toutx[index] = gl_GlobalInvocationID.x;
            touty[index] = gl_GlobalInvocationID.y;
        }
    """
    # The shader is handed to record_algo_data as UTF-8 bytes.
    shader_bytes = glsl_text.encode("utf8")

    grid = (16, 8, 1)

    seq = mgr.create_sequence()
    seq.begin()
    seq.record_algo_data([out_x, out_y], shader_bytes, grid)
    seq.end()
    seq.eval()

    mgr.eval_tensor_sync_local_def([out_x, out_y])

    # Row-major flattening of a 16x8 grid: x repeats 8 times per row,
    # y cycles 0..7 within every row.
    expected_x = np.repeat(np.arange(16), 8)
    expected_y = np.tile(np.arange(8), 16)

    assert np.all(out_x.numpy() == expected_x)
    assert np.all(out_y.numpy() == expected_y)
def test_pushconsts_int():
    """Accumulate int32 push constants into a tensor over three dispatches.

    The shader adds the three push-constant fields to the first three
    tensor elements. The algorithm is dispatched three times, and the test
    expects every element to end at -3.
    """
    spirv = compile_source("""
        #version 450
        layout(push_constant) uniform PushConstants {
            int x;
            int y;
            int z;
        } pcs;
        layout (local_size_x = 1) in;
        layout(set = 0, binding = 0) buffer a { int pa[]; };
        void main() {
            pa[0] += pcs.x;
            pa[1] += pcs.y;
            pa[2] += pcs.z;
        }
    """)

    mgr = kp.Manager()

    tensor = mgr.tensor_t(np.array([0, 0, 0], dtype=np.int32))

    no_spec_consts = np.array([], dtype=np.int32)
    default_push = np.array([-1, -1, -1], dtype=np.int32)

    algo = mgr.algorithm([tensor], spirv, (1, 1, 1), no_spec_consts,
                         default_push)

    seq = mgr.sequence()
    seq.record(kp.OpTensorSyncDevice([tensor]))
    # First dispatch passes no push constants of its own.
    seq.record(kp.OpAlgoDispatch(algo))
    # Two further dispatches pass the push constants explicitly.
    for _ in range(2):
        override = np.array([-1, -1, -1], dtype=np.int32)
        seq.record(kp.OpAlgoDispatch(algo, override))
    seq.record(kp.OpTensorSyncLocal([tensor]))
    seq.eval()

    expected = np.array([-3, -3, -3], dtype=np.int32)
    assert np.all(tensor.data() == expected)
def test_logistic_regression():
    """Train a two-feature logistic regression with a pyshader compute shader.

    The shader computes, per sample, the gradient contributions for both
    weights and the bias plus the per-sample loss. The host applies those
    contributions with a fixed learning rate for a fixed number of
    iterations and then checks the learned weights and bias against
    loose bounds.
    """

    # NOTE: no docstring inside the shader function; only comments, so the
    # function body handed to python2shader contains statements only.
    @ps.python2shader
    def compute_shader(index=("input", "GlobalInvocationId", ps.ivec3),
                       x_i=("buffer", 0, ps.Array(ps.f32)),
                       x_j=("buffer", 1, ps.Array(ps.f32)),
                       y=("buffer", 2, ps.Array(ps.f32)),
                       w_in=("buffer", 3, ps.Array(ps.f32)),
                       w_out_i=("buffer", 4, ps.Array(ps.f32)),
                       w_out_j=("buffer", 5, ps.Array(ps.f32)),
                       b_in=("buffer", 6, ps.Array(ps.f32)),
                       b_out=("buffer", 7, ps.Array(ps.f32)),
                       l_out=("buffer", 8, ps.Array(ps.f32)),
                       M=("buffer", 9, ps.Array(ps.f32))):

        i = index.x

        m = M[0]

        w_curr = vec2(w_in[0], w_in[1])
        b_curr = b_in[0]

        x_curr = vec2(x_i[i], x_j[i])
        y_curr = y[i]

        # Forward pass: sigmoid(w . x + b)
        z_dot = w_curr @ x_curr
        z = z_dot + b_curr
        y_hat = 1.0 / (1.0 + exp(-z))

        # Gradient contributions, each scaled by 1/m
        d_z = y_hat - y_curr
        d_w = (1.0 / m) * x_curr * d_z
        d_b = (1.0 / m) * d_z

        # Binary cross-entropy: -(y*log(y_hat) + (1 - y)*log(1 - y_hat)).
        # The second term previously used (1 + y), which is not the
        # cross-entropy loss.
        loss = -((y_curr * log(y_hat)) + ((1.0 - y_curr) * log(1.0 - y_hat)))

        w_out_i[i] = d_w.x
        w_out_j[i] = d_w.y
        b_out[i] = d_b
        l_out[i] = loss

    mgr = kp.Manager(0)

    # First we create input and output tensors for shader
    tensor_x_i = mgr.tensor(np.array([0.0, 1.0, 1.0, 1.0, 1.0]))
    tensor_x_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_y = mgr.tensor(np.array([0.0, 0.0, 0.0, 1.0, 1.0]))

    tensor_w_in = mgr.tensor(np.array([0.001, 0.001]))
    tensor_w_out_i = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))
    tensor_w_out_j = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_b_in = mgr.tensor(np.array([0.0]))
    tensor_b_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    tensor_l_out = mgr.tensor(np.array([0.0, 0.0, 0.0, 0.0, 0.0]))

    # Sample count, read by the shader as M[0]
    tensor_m = mgr.tensor(np.array([tensor_y.size()]))

    # We store them in an array for easier interaction; the order must
    # match the buffer indices declared in compute_shader.
    params = [
        tensor_x_i, tensor_x_j, tensor_y, tensor_w_in, tensor_w_out_i,
        tensor_w_out_j, tensor_b_in, tensor_b_out, tensor_l_out, tensor_m
    ]

    # One-off upload of every tensor before training starts
    mgr.sequence().eval(kp.OpTensorSyncDevice(params))

    # Create a managed sequence
    sq = mgr.sequence()

    # Record operation to sync memory from local to GPU memory
    # (only the weights and bias change between iterations)
    sq.record(kp.OpTensorSyncDevice([tensor_w_in, tensor_b_in]))

    # Record operation to execute GPU shader against all our parameters
    sq.record(
        kp.OpAlgoDispatch(mgr.algorithm(params, compute_shader.to_spirv())))

    # Record operation to sync memory from GPU to local memory
    sq.record(
        kp.OpTensorSyncLocal(
            [tensor_w_out_i, tensor_w_out_j, tensor_b_out, tensor_l_out]))

    ITERATIONS = 100
    learning_rate = 0.1

    # Perform machine learning training and inference across all input X and Y
    for i_iter in range(ITERATIONS):

        # Execute an iteration of the algorithm
        sq.eval()

        # Calculate the parameters based on the respective derivatives
        # calculated
        for j_iter in range(tensor_b_out.size()):
            tensor_w_in.data()[0] -= (
                learning_rate * tensor_w_out_i.data()[j_iter])
            tensor_w_in.data()[1] -= (
                learning_rate * tensor_w_out_j.data()[j_iter])
            tensor_b_in.data()[0] -= (
                learning_rate * tensor_b_out.data()[j_iter])

    assert tensor_w_in.data()[0] < 0.01
    assert tensor_w_in.data()[0] > 0.0
    assert tensor_w_in.data()[1] > 1.5
    assert tensor_b_in.data()[0] < 0.7
import sys
import time
import sh_conv
import sh_common

# Command line: run_vgg7.py INPUT OUTPUT
if len(sys.argv) != 3:
    print("run_vgg7.py INPUT OUTPUT")
    print(" Tiling is not implemented, but padding is implemented")
    sys.exit(1)

# NOTES:
# + Tiling is not implemented, but padding is implemented
#    So don't run anything too big through it

# NOTE(review): `kp` and `numpy` are used below but are not imported in the
# visible part of this file -- presumably imported earlier; confirm.

# Developer toggle: the first branch is disabled. When enabled it selects
# device 1 and refuses to run if that device's name contains "RAVEN".
if False:
    kpm = kp.Manager(1)
    if kpm.get_device_properties()["device_name"].count("RAVEN") > 0:
        # Raising a bare string is a TypeError in Python 3; raise a real
        # exception carrying the same message instead.
        raise RuntimeError("Safety cut-out triggered. Sorry!")
else:
    kpm = kp.Manager()

image = sh_common.image_load(sys.argv[1])
# Double the size along the first two axes by repeating each element.
image = image.repeat(2, 0).repeat(2, 1)
# Add a 7-element border on the first two axes, replicating edge values.
image = numpy.pad(image, [[7, 7], [7, 7], [0, 0]], mode="edge")

# Ensure image has 4 channels even though they will be unused.
# This is because of vectorization vec4 magic.
# All missing channels are appended in a single zero-filled pad.
missing_channels = sh_common.VSZ - image.shape[2]
if missing_channels > 0:
    image = numpy.pad(image, [[0, 0], [0, 0], [0, missing_channels]],
                      mode="constant")
# sh_common.image_save("pad.png", image)
def render_base(args, folder):
    """Render a range of frames with a precompiled compute shader.

    For every frame from ``args.start`` to ``args.end`` inclusive, the
    shader is run 16 times (once per chunk offset). The output tensor is
    then copied back, reshaped to ``(height, width, 3)``, flipped
    vertically and saved as ``output/image<frame>.png``. When ``args.vis``
    is set, each frame is also shown in a pygame window, and after the
    last frame the function waits for the window to be closed and then
    exits the interpreter.

    Args:
        args: Object providing ``width``, ``height``, ``vis``, ``verbose``,
            ``device``, ``scene``, ``start`` and ``end`` attributes.
        folder: Path prefix concatenated directly with ``args.scene`` and
            ``".spv"`` to locate the shader, so it must already end with
            a path separator.
    """
    SIZE = (args.width, args.height)

    # pygame setup if visual enabled
    surf = None
    if args.vis:
        pygame.init()
        surf = pygame.display.set_mode(SIZE)

    # change verbosity level: verbose is clamped to 0..4 and mapped to the
    # logging levels 50 (least output) down to 10 (most output)
    kp_logger = logging.getLogger("kp")
    kp_logger.setLevel(50 - (max(min(args.verbose, 4), 0) * 10))

    # init manager
    mgr = kp.Manager(args.device)

    # shader inputs
    tensor_size = kp.Tensor(SIZE)
    tensor_frame = kp.Tensor([0])
    tensor_offset = kp.Tensor([0])
    tensor_out = kp.Tensor(np.zeros((SIZE[0] * SIZE[1] * 3)))

    # allocate memory on gpu
    mgr.eval_tensor_create_def(
        [tensor_out, tensor_size, tensor_frame, tensor_offset])

    # read shader; the context manager closes the file even if reading or
    # a later step raises
    with open(folder + args.scene + ".spv", "rb") as shader_file:
        shader_bytes = shader_file.read()

    # create sequences
    # uploads the current frame number
    sq_sdf = mgr.create_sequence()
    sq_sdf.begin()
    sq_sdf.record_tensor_sync_device([tensor_frame])
    sq_sdf.end()

    # uploads the current chunk offset
    sq_sdo = mgr.create_sequence()
    sq_sdo.begin()
    sq_sdo.record_tensor_sync_device([tensor_offset])
    sq_sdo.end()

    # runs the shader
    sq_r = mgr.create_sequence()
    sq_r.begin()
    sq_r.record_algo_data(
        [tensor_out, tensor_size, tensor_frame, tensor_offset], shader_bytes)
    sq_r.end()

    # downloads the rendered output
    sq_sl = mgr.create_sequence()
    sq_sl.begin()
    sq_sl.record_tensor_sync_local([tensor_out])
    sq_sl.end()

    # render frames
    for i in range(args.start, args.end + 1):
        if args.verbose > 0:
            print("rendering frame {}".format(i))

        # run program
        tensor_frame[0] = i
        # copy frame to shader
        sq_sdf.eval()
        # split into smaller chunks
        for j in range(16):
            if args.verbose > 1:
                print("- rendering chunk {}".format(j))
            tensor_offset[0] = j
            # copy offset to shader
            sq_sdo.eval()
            # run shader
            sq_r.eval()
        # copy frame from shader
        sq_sl.eval()

        # save frame to output
        frame = np.flip(
            np.array(tensor_out.data()).reshape((SIZE[1], SIZE[0], 3)),
            axis=0)
        plt.imsave("output/image{}.png".format(i), frame)

        # visualize
        if args.vis:
            # create surface from array (pygame expects width-major axes
            # and 0..255 values)
            surf2 = pygame.surfarray.make_surface(
                np.swapaxes(frame, 0, 1) * 255)
            # weird pygame bug: blit and update twice
            surf.blit(surf2, (0, 0))
            pygame.display.update()
            surf.blit(surf2, (0, 0))
            pygame.display.update()

            # stop on last frame: keep the window open until it is closed
            if i == args.end:
                while True:
                    for event in pygame.event.get():
                        if event.type == pygame.QUIT:
                            quit()