def top(input, filter, bias):
    input_extent_3_required_s = (((((final_extent_2 + 31)//32) * final_extent_3) + -1) //
                                 hcl.select(((final_extent_2 + 31)//32) > 1, ((final_extent_2 + 31)//32), 1))
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    final_total_extent_2 = (final_total_extent_1 * hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_2))
    final_total_extent_3 = (final_total_extent_2 * hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_3))
    f_conv_n_extent_realized_s = hcl.select(
        hcl.select((((final_extent_2 * final_extent_3) + -1)//hcl.select(final_extent_2 > 1, final_extent_2, 1)) > (final_extent_3 + -1),
                   (((final_extent_2 * final_extent_3) + -1)//hcl.select(final_extent_2 > 1, final_extent_2, 1)),
                   (final_extent_3 + -1)) >
        (((((final_extent_2 + 31)//32) * final_extent_3) + -1)//(hcl.select(((final_extent_2 + -1)//32) > 0, ((final_extent_2 + -1)//32), 0) + 1)),
        hcl.select((((final_extent_2 * final_extent_3) + -1)//hcl.select(final_extent_2 > 1, final_extent_2, 1)) > (final_extent_3 + -1),
                   (((final_extent_2 * final_extent_3) + -1)//hcl.select(final_extent_2 > 1, final_extent_2, 1)),
                   (final_extent_3 + -1)),
        (((((final_extent_2 + 31)//32) * final_extent_3) + -1)//(hcl.select(((final_extent_2 + -1)//32) > 0, ((final_extent_2 + -1)//32), 0) + 1)))
    f_conv_z_extent_realized = hcl.select(
        ((hcl.select(((final_extent_2 + -1)//32) > 0, ((final_extent_2 + -1)//32), 0) * 32) + 32) > final_extent_2,
        ((hcl.select(((final_extent_2 + -1)//32) > 0, ((final_extent_2 + -1)//32), 0) * 32) + 32),
        final_extent_2)
    f_conv = hcl.compute(
        (final_extent_0, ((((final_extent_1 + -1)//32) * 32) + 32),
         f_conv_z_extent_realized, (f_conv_n_extent_realized_s + 1)),
        lambda x, y, z, w: 0, name="f_conv", dtype=hcl.Float(bits=32))
    with hcl.Stage("f_conv"):
        with hcl.for_(0, (final_extent_2 * final_extent_3), name="f_conv_s0_z_par") as f_conv_s0_z_par:
            with hcl.for_(final_min_1, final_extent_1, name="f_conv_s0_y") as f_conv_s0_y:
                with hcl.for_(final_min_0, final_extent_0, name="f_conv_s0_x") as f_conv_s0_x:
                    f_conv[f_conv_s0_x, f_conv_s0_y,
                           ((f_conv_s0_z_par % hcl.select(final_extent_2 > 1, final_extent_2, 1)) + final_min_2),
                           ((f_conv_s0_z_par//hcl.select(final_extent_2 > 1, final_extent_2, 1)) + final_min_3)] = \
                        bias[((f_conv_s0_z_par % hcl.select(final_extent_2 > 1, final_extent_2, 1)) + final_min_2)]
        with hcl.for_(0, (((final_extent_2 + 31)//32) * final_extent_3), name="f_conv_s1_z_z_par") as f_conv_s1_z_z_par:
            f_conv_s1_z_z_t_base_s = (f_conv_s1_z_z_par % hcl.select(((final_extent_2 + 31)//32) > 1, ((final_extent_2 + 31)//32), 1))
            with hcl.for_(0, 32, name="f_conv_s1_r__z") as f_conv_s1_r__z:
                with hcl.for_(0, ((final_extent_1 + 31)//32), name="f_conv_s1_y_y") as f_conv_s1_y_y:
                    with hcl.for_(0, 32, name="f_conv_s1_z_z_t") as f_conv_s1_z_z_t:
                        with hcl.for_(0, 32, name="f_conv_s1_y_y_t") as f_conv_s1_y_y_t:
                            with hcl.for_(final_min_0, final_extent_0, name="f_conv_s1_x") as f_conv_s1_x:
                                with hcl.for_(0, 3, name="f_conv_s1_r__y_r21") as f_conv_s1_r__y_r21:
                                    with hcl.for_(0, 3, name="f_conv_s1_r__x_r20") as f_conv_s1_r__x_r20:
                                        t51_s = (f_conv_s1_z_z_par//hcl.select(((final_extent_2 + 31)//32) > 1, ((final_extent_2 + 31)//32), 1))
                                        f_conv[f_conv_s1_x,
                                               (((f_conv_s1_y_y * 32) + final_min_1) + f_conv_s1_y_y_t),
                                               (((f_conv_s1_z_z_t_base_s * 32) + final_min_2) + f_conv_s1_z_z_t),
                                               (final_min_3 + t51_s)] = \
                                            (f_conv[f_conv_s1_x,
                                                    (((f_conv_s1_y_y * 32) + final_min_1) + f_conv_s1_y_y_t),
                                                    (((f_conv_s1_z_z_t_base_s * 32) + final_min_2) + f_conv_s1_z_z_t),
                                                    (final_min_3 + t51_s)] +
                                             (filter[f_conv_s1_r__x_r20, f_conv_s1_r__y_r21, f_conv_s1_r__z,
                                                     (((f_conv_s1_z_z_t_base_s * 32) + final_min_2) + f_conv_s1_z_z_t)] *
                                              input[(f_conv_s1_r__x_r20 + f_conv_s1_x),
                                                    ((((f_conv_s1_y_y * 32) + final_min_1) + f_conv_s1_y_y_t) + f_conv_s1_r__y_r21),
                                                    f_conv_s1_r__z,
                                                    (final_min_3 + t51_s)]))
    final = hcl.compute((64, 64, 32, 4), lambda x, y, z, w: 0, name="final", dtype=hcl.Float(bits=32))
    with hcl.Stage("final"):
        with hcl.for_(final_min_3, final_extent_3, name="final_s0_n") as final_s0_n:
            with hcl.for_(final_min_2, final_extent_2, name="final_s0_z") as final_s0_z:
                with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
                    with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                        final[final_s0_x, final_s0_y, final_s0_z, final_s0_n] = hcl.select(
                            f_conv[final_s0_x, final_s0_y, final_s0_z, final_s0_n] > hcl.cast(dtype=hcl.Float(bits=32), expr=0.000000),
                            f_conv[final_s0_x, final_s0_y, final_s0_z, final_s0_n],
                            hcl.cast(dtype=hcl.Float(bits=32), expr=0.000000))
    return final

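# The generated `top` pipelines in this file reference free variables such as
# final_min_0..final_min_3 and final_extent_0..final_extent_3; they are assumed
# to be module-level bounds supplied by the surrounding test harness. A minimal
# sketch with illustrative values matching the (64, 64, 32, 4) output above:
final_min_0 = final_min_1 = final_min_2 = final_min_3 = 0
final_extent_0, final_extent_1 = 64, 64   # spatial x/y extents
final_extent_2, final_extent_3 = 32, 4    # channel / batch extents
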
def kernel(A, B):
    C = hcl.compute((10, 32), lambda *args: A[args] + B[args], "C")
    with hcl.Stage("Super") as m:
        hcl.update(C, lambda *args: C[args] + 1, "update")
    with hcl.Stage("Plus") as stage:
        with hcl.for_(0, 10) as j:
            C[j, 0] = 10
    return C

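# A driver for kernels in this style follows the same pattern as the tests
# near the end of this file: declare placeholders, schedule the kernel, build,
# and run on hcl.asarray buffers. A minimal sketch for `kernel` above
# (shapes and data are illustrative):
import numpy as np
import heterocl as hcl

hcl.init(hcl.Int())
A = hcl.placeholder((10, 32), "A")
B = hcl.placeholder((10, 32), "B")
s = hcl.create_schedule([A, B], kernel)
f = hcl.build(s)

a = hcl.asarray(np.random.randint(0, 10, (10, 32)))
b = hcl.asarray(np.random.randint(0, 10, (10, 32)))
c = hcl.asarray(np.zeros((10, 32)))   # receives the returned tensor C
f(a, b, c)
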
def top(input):
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    max_local = hcl.compute((final_extent_0, final_extent_1), lambda x, y: 0,
                            name="max_local", dtype=hcl.UInt(bits=16))
    with hcl.Stage("max_local"):
        with hcl.for_(final_min_1, final_extent_1, name="max_local_s0_y") as max_local_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="max_local_s0_x") as max_local_s0_x:
                maximum = hcl.compute((1, 1), lambda x, y: 0, name="maximum", dtype=hcl.UInt(bits=16))
                with hcl.Stage("maximum"):
                    maximum[max_local_s0_x, max_local_s0_y] = hcl.cast(dtype=hcl.UInt(bits=16), expr=0)
                    with hcl.for_(0, 3, name="maximum_s1_box__y") as maximum_s1_box__y:
                        with hcl.for_(0, 3, name="maximum_s1_box__x") as maximum_s1_box__x:
                            maximum[max_local_s0_x, max_local_s0_y] = hcl.select(
                                maximum[max_local_s0_x, max_local_s0_y] >
                                input[(max_local_s0_x + maximum_s1_box__x),
                                      (max_local_s0_y + maximum_s1_box__y)],
                                maximum[max_local_s0_x, max_local_s0_y],
                                input[(max_local_s0_x + maximum_s1_box__x),
                                      (max_local_s0_y + maximum_s1_box__y)])
                max_local[max_local_s0_x, max_local_s0_y] = maximum[max_local_s0_x, max_local_s0_y]
    final = hcl.compute((640, 480), lambda x, y: 0, name="final", dtype=hcl.UInt(bits=16))
    with hcl.Stage("final"):
        with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                final[final_s0_x, final_s0_y] = max_local[final_s0_x, final_s0_y]
    return final

def kernel(trainData, testData, itemMem, idMem, rdv1, rdv2):
    def train_encoding(m, preTrainData):
        train_temp = hcl.compute((trainData.shape[1], dim),
                                 lambda x, y: itemMem[trainData[m][x]][y] ^ idMem[x][y],
                                 name="train_temp")
        k1 = hcl.reduce_axis(0, trainData.shape[1], 'k1')
        train_result = hcl.compute((dim,),
                                   lambda x: hcl.sum(train_temp[k1, x], axis=k1, dtype=hcl.Int()),
                                   name="train_result")
        with hcl.for_(0, dim) as n:
            preTrainData[m][n] = train_result[n]
        with hcl.if_((m + 1) % 1000 == 0):
            hcl.print((m + 1), "Finish encoding %d training data\n")

    def test_encoding(m, preTestData):
        test_temp = hcl.compute((testData.shape[1], dim),
                                lambda x, y: itemMem[testData[m][x]][y] ^ idMem[x][y],
                                name="test_temp")
        k2 = hcl.reduce_axis(0, testData.shape[1], 'k2')
        test_result = hcl.compute((dim,),
                                  lambda x: hcl.sum(test_temp[k2, x], axis=k2, dtype=hcl.Int()),
                                  name="test_result")
        with hcl.for_(0, dim) as n:
            preTestData[m][n] = test_result[n]
        with hcl.if_((m + 1) % 100 == 0):
            hcl.print((m + 1), "Finish encoding %d testing data\n")

    # encoding
    hcl.print((), "Encoding the training data into HDVs.\n")
    preTrainData = hcl.compute((trainData.shape[0], dim), lambda x, y: 0, "preTrainData")
    hcl.mutate((trainData.shape[0],), lambda x: train_encoding(x, preTrainData))

    hdTrainData = hcl.compute((trainData.shape[0], dim), lambda x, y: 0,
                              "hdTrainData", dtype=hcl.UInt(1))
    with hcl.Stage("S1"):
        with hcl.if_(trainData.shape[1] % 2 == 0):
            hcl.print((), "Use the random vector\n")
            hcl.update(hdTrainData, lambda x, y: hcl.select(
                preTrainData[x][y] + rdv1[x][y] - trainData.shape[1]/2 > 0, 1, 0))
        with hcl.else_():
            hcl.update(hdTrainData, lambda x, y: hcl.select(
                preTrainData[x][y] - trainData.shape[1]/2 > 0, 1, 0))

    hcl.print((), "Encoding the testing data into HDVs.\n")
    preTestData = hcl.compute((testData.shape[0], dim), lambda x, y: 0, "preTestData")
    hcl.mutate((testData.shape[0],), lambda x: test_encoding(x, preTestData))

    hdTestData = hcl.compute((testData.shape[0], dim), lambda x, y: 0,
                             "hdTestData", dtype=hcl.UInt(1))
    with hcl.Stage("S2"):
        with hcl.if_(testData.shape[1] % 2 == 0):
            hcl.print((), "Use the random vector\n")
            hcl.update(hdTestData, lambda x, y: hcl.select(
                preTestData[x][y] + rdv2[x][y] - testData.shape[1]/2 > 0, 1, 0))
        with hcl.else_():
            hcl.update(hdTestData, lambda x, y: hcl.select(
                preTestData[x][y] - testData.shape[1]/2 > 0, 1, 0))

    # data packing
    pack_train = hcl.pack(hdTrainData, axis=1, dtype=hcl.UInt(bw), name="pack_train")
    pack_test = hcl.pack(hdTestData, axis=1, dtype=hcl.UInt(bw), name="pack_test")
    return pack_train, pack_test

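# `dim` and `bw` are free variables in the HDC kernel above; they are assumed
# to be module-level constants (hypervector dimensionality and pack width).
# Illustrative definitions:
dim = 1000   # hypervector dimensionality
bw = 32      # bit width consumed by hcl.pack / hcl.UInt(bw)
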
def fft(X_real, X_imag, IndexTable, F_real, F_imag):
    L = X_real.shape[0]
    if np.log2(L) % 1 > 0:
        raise ValueError("Length of input vector (1d tensor) must be a power of 2")
    num_stages = int(np.log2(L))

    # bit-reverse permutation
    hcl.update(F_real, lambda i: X_real[IndexTable[i]], name='F_real_update')
    hcl.update(F_imag, lambda i: X_imag[IndexTable[i]], name='F_imag_update')

    with hcl.Stage("Out"):
        one = hcl.scalar(1, dtype="int32")
        with hcl.for_(0, num_stages) as stage:
            DFTpts = one[0] << (stage + 1)   # number of points in this stage's DFTs
            numBF = DFTpts / 2               # number of butterflies per DFT
            e = -2 * np.pi / DFTpts
            a = hcl.scalar(0)
            with hcl.for_(0, numBF) as j:
                c = hcl.scalar(hcl.cos(a[0]))
                s = hcl.scalar(hcl.sin(a[0]))
                a[0] = a[0] + e
                with hcl.for_(j, L + DFTpts - 1, DFTpts) as i:
                    i_lower = i + numBF
                    temp_r = hcl.scalar(F_real[i_lower] * c - F_imag[i_lower] * s)
                    temp_i = hcl.scalar(F_imag[i_lower] * c + F_real[i_lower] * s)
                    F_real[i_lower] = F_real[i] - temp_r[0]
                    F_imag[i_lower] = F_imag[i] - temp_i[0]
                    F_real[i] = F_real[i] + temp_r[0]
                    F_imag[i] = F_imag[i] + temp_i[0]

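# `fft` expects IndexTable to already hold the bit-reversed permutation of
# 0..L-1 used by the first two hcl.update calls. A plain-NumPy sketch of how
# such a table can be built on the host (helper name is hypothetical):
import numpy as np

def bit_reverse_table(L):
    # Reverse the log2(L)-bit binary representation of each index.
    bits = int(np.log2(L))
    return np.array([int(format(i, "0{}b".format(bits))[::-1], 2) for i in range(L)])

# For L = 8 this yields [0, 4, 2, 6, 1, 5, 3, 7].
index_table = bit_reverse_table(8)
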
def kernel(A):
    with hcl.Stage():
        with hcl.for_(0, 10) as i:
            with hcl.for_(0, 10) as j:
                with hcl.if_(j >= i):
                    hcl.break_()
                A[i] += j

def kernel(A, B, C, O):
    dtype_xyz = hcl.Struct({"x": hcl.Int(), "y": hcl.Int(), "z": hcl.Int()})
    dtype_out = hcl.Struct({
        "v0": hcl.Int(), "v1": hcl.Int(), "v2": hcl.Int(),
        "v3": hcl.Int(), "v4": hcl.Int(), "v5": hcl.Int()
    })
    D = hcl.compute(A.shape, lambda x: (A[x], B[x], C[x]), dtype=dtype_xyz)
    E = hcl.compute(A.shape,
                    lambda x: (D[x].x * D[x].x, D[x].y * D[x].y, D[x].z * D[x].z,
                               D[x].x * D[x].y, D[x].y * D[x].z, D[x].x * D[x].z),
                    dtype=dtype_out)
    with hcl.Stage():
        with hcl.for_(0, 100) as i:
            for j in range(0, 6):
                O[i][j] = E[i].__getattr__("v" + str(j))

def func(data):
    out = hcl.compute((4, 4), lambda x, y: 0, "out", dtype)
    with hcl.Stage("S"):
        with hcl.for_(0, 4, name="i") as i:
            with hcl.for_(0, 4, name="j") as j:
                out[i, j] = data[i, j] + 1
    return out

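# `dtype` is a free variable in `func` (and in the pooling kernels below); it
# is assumed to be a module-level constant. An illustrative choice:
dtype = hcl.Float()
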
def kernel(A, B, C):
    with hcl.Stage("S"):
        with hcl.for_(0, 10) as i:
            # set the LSB of B to be the same as A
            B[i][0] = A[i][0]
            # set the lower 4 bits of C
            C[i][4:0] = A[i]

def algorithm(A, B):
    @hcl.def_([A.shape, B.shape, ()])
    def update_B(A, B, x):
        B[x] = A[x] + 1

    with hcl.Stage():
        with hcl.for_(0, 10) as i:
            update_B(A, B, i)

def kernel(A):
    with hcl.Stage():
        with hcl.if_(A[0] > 5):
            A[0] = 5
        with hcl.elif_(A[0] > 3):
            A[0] = 3
        with hcl.else_():
            A[0] = 0

def kernel(A):
    with hcl.Stage():
        i = hcl.scalar(0)
        with hcl.while_(True):
            with hcl.if_(i[0] > 5):
                hcl.break_()
            A[i[0]] = i[0]
            i[0] += 1

def kernel(A, B):
    C = hcl.compute(A.shape, lambda *args: 0, "C")
    with hcl.Stage("stage"):
        with hcl.for_(0, 10, name="i") as i:
            with hcl.for_(0, 32, name="j") as j:
                B[i, j] = A[i, j] + B[i, j]
                C[i, j] = 2 * B[i, j]
    return C

def kernel(A, B):
    C = hcl.compute((10, 32), lambda *args: 0, "C")
    D = hcl.compute(C.shape, lambda *args: 0, "D")
    with hcl.Stage("Super") as m:
        with hcl.for_(0, 10, name="j") as j:
            hcl.update(D, lambda *args: j * A[args] + B[args], name="update.D")
            hcl.update(C, lambda *args: A[args] + j * D[args], name="update.C")
    return C

def top(input):
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    blur_x = hcl.compute((final_extent_0, (final_extent_1 + 2)), lambda x, y: 0,
                         name="blur_x", dtype=hcl.UInt(bits=16))
    with hcl.Stage("blur_x"):
        with hcl.for_(final_min_1, (final_extent_1 + 2), name="blur_x_s0_y") as blur_x_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="blur_x_s0_x") as blur_x_s0_x:
                blur_x[blur_x_s0_x, blur_x_s0_y] = (
                    (input[(blur_x_s0_x + 2), blur_x_s0_y] +
                     (input[blur_x_s0_x, blur_x_s0_y] +
                      input[(blur_x_s0_x + 1), blur_x_s0_y])) //
                    hcl.cast(dtype=hcl.UInt(bits=16), expr=3))
    blur_y = hcl.compute((final_extent_0, final_extent_1), lambda x, y: 0,
                         name="blur_y", dtype=hcl.UInt(bits=16))
    with hcl.Stage("blur_y"):
        with hcl.for_(final_min_1, final_extent_1, name="blur_y_s0_y") as blur_y_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="blur_y_s0_x") as blur_y_s0_x:
                blur_y[blur_y_s0_x, blur_y_s0_y] = (
                    (blur_x[blur_y_s0_x, (blur_y_s0_y + 2)] +
                     (blur_x[blur_y_s0_x, blur_y_s0_y] +
                      blur_x[blur_y_s0_x, (blur_y_s0_y + 1)])) //
                    hcl.cast(dtype=hcl.UInt(bits=16), expr=3))
    final = hcl.compute((640, 480), lambda x, y: 0, name="final", dtype=hcl.UInt(bits=16))
    with hcl.Stage("final"):
        with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                final[final_s0_x, final_s0_y] = blur_y[final_s0_x, final_s0_y]
    return final

def top(input):
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    mean_local = hcl.compute((final_extent_0, final_extent_1), lambda x, y: 0,
                             name="mean_local", dtype=hcl.UInt(bits=16))
    with hcl.Stage("mean_local"):
        with hcl.for_(final_min_1, final_extent_1, name="mean_local_s0_y") as mean_local_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="mean_local_s0_x") as mean_local_s0_x:
                mean_local[mean_local_s0_x, mean_local_s0_y] = hcl.cast(dtype=hcl.UInt(bits=16), expr=0)
        with hcl.for_(final_min_1, final_extent_1, name="mean_local_s1_y") as mean_local_s1_y:
            with hcl.for_(final_min_0, final_extent_0, name="mean_local_s1_x") as mean_local_s1_x:
                with hcl.for_(0, 3, name="mean_local_s1_box__y") as mean_local_s1_box__y:
                    with hcl.for_(0, 3, name="mean_local_s1_box__x") as mean_local_s1_box__x:
                        mean_local[mean_local_s1_x, mean_local_s1_y] = (
                            mean_local[mean_local_s1_x, mean_local_s1_y] +
                            (input[(mean_local_s1_box__x + mean_local_s1_x),
                                   (mean_local_s1_box__y + mean_local_s1_y)] //
                             hcl.cast(dtype=hcl.UInt(bits=16), expr=9)))
    final = hcl.compute((6418, 4818), lambda x, y: 0, name="final", dtype=hcl.UInt(bits=16))
    with hcl.Stage("final"):
        with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                final[final_s0_x, final_s0_y] = mean_local[final_s0_x, final_s0_y]
    return final

def test_schedule_intra_stage():
    hcl.init()

    def popcount(A, B):
        # each element in A is a 32-bit integer
        with hcl.for_(0, A.shape[0], name="x") as x:
            with hcl.for_(0, A.shape[1], name="y") as y:
                B[x, y] = 0
                with hcl.for_(0, 32) as i:
                    B[x, y] += A[x, y][i]

    A = hcl.placeholder((10, 20))
    B = hcl.placeholder(A.shape)

    with hcl.Stage() as C:
        popcount(A, B)

    def test_unroll():
        s = hcl.create_schedule([A, B])
        s[C].unroll(C.x, factor=3)
        ir = hcl.lower(s)
        assert "unrolled \"factor\"=3" in str(ir)

    def test_reorder():
        s = hcl.create_schedule([A, B])
        s[C].reorder(C.y, C.x)
        ir = hcl.lower(s)
        assert str(ir.body.body.body.body).startswith("for (y, 0, 20)")
        assert str(ir.body.body.body.body.body).startswith("for (x, 0, 10)")

    def test_fuse():
        s = hcl.create_schedule([A, B])
        s[C].fuse(C.x, C.y)
        ir = hcl.lower(s)
        assert str(ir.body.body.body.body).startswith("for (x.y.fused, 0, 200)")

    def test_split():
        s = hcl.create_schedule([A, B])
        s[C].split(C.x, factor=3)
        ir = hcl.lower(s)
        assert str(ir.body.body.body.body).startswith("for (x.outer, 0, 4)")
        assert str(ir.body.body.body.body.body).startswith("for (x.inner, 0, 3)")
        assert str(ir.body.body.body.body.body.body).startswith("for (y, 0, 20)")
        assert str(ir.body.body.body.body.body.body.body).startswith(
            "if ((x.inner < (10 - (x.outer*3))))")

    test_unroll()
    test_reorder()
    test_fuse()
    test_split()

def vadd_vhls_ip(op1, op2, size, name=None):
    if name is None:
        name = "vadd"
    with hcl.Stage("ExternModule.vadd") as Module:
        register_tensors([op1, op2])
        Module.ext_ip_name = name
        Module.inputs = [op1, op2, size]
        # include cpp/hpp files
        deps = os.path.dirname(os.path.abspath(__file__))
        source = ["vadd.cpp"]
        Module.source = include_dependency(source)
        create_extern_module(Module, ip_type="HLS")

def insertion_sort(A):
    # Introduce a stage.
    with hcl.Stage("S"):
        # for i in range(1, A.shape[0])
        # We can name the axis
        with hcl.for_(1, A.shape[0], name="i") as i:
            key = hcl.local(A[i], "key")
            j = hcl.local(i - 1, "j")
            # while(j >= 0 && key < A[j])
            with hcl.while_(hcl.and_(j >= 0, key < A[j])):
                A[j + 1] = A[j]
                j[0] -= 1
            A[j + 1] = key[0]

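# `insertion_sort` mutates A in place and returns nothing, so a driver only
# needs the input buffer. A minimal sketch, assuming a 1-D integer input:
import numpy as np
import heterocl as hcl

hcl.init(hcl.Int())
A = hcl.placeholder((10,), "A")
s = hcl.create_schedule([A], insertion_sort)
f = hcl.build(s)

a_np = np.random.randint(0, 100, 10)
a_hcl = hcl.asarray(a_np)
f(a_hcl)
assert (a_hcl.asnumpy() == np.sort(a_np)).all()
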
def systolic_array(A, B):
    # define modules with loop
    @hcl.def_([(1,), (1,), ()])
    def pe(a, b, x):
        with hcl.if_(x == 0):
            result = a * b
            hcl.return_(a)
        with hcl.elif_(x == 1):
            hcl.return_(b)
        with hcl.else_():
            hcl.return_(result)

    # PE = {f'pe_{i}' : partial(pe) for i in range(w*h)}
    PE = {}
    for i in range(w * h):
        with hcl.Stage("pe_{}".format(i)):
            PE['pe_{}'.format(i)] = partial(pe)

    # each k calls of update function calculate one block of result matrix
    # b_row: block row index
    # b_col: block col index
    def update(b_row, b_col, k, O):
        # fetch input
        localA = []
        localB = []
        for input_a in range(h):
            localA.append(hcl.compute((1,), lambda x: A[input_a + h * b_row, k],
                                      "localA_{}".format(input_a)))
        for input_b in range(w):
            localB.append(hcl.compute((1,), lambda x: B[k, input_b + w * b_col],
                                      "localB_{}".format(input_b)))

        # systolic connection: one fresh list per row
        # (the original `[[None] * h] * w` aliased every row to the same list)
        net = [[None] * w for _ in range(h)]
        for i in range(h + w - 1):
            for row in range(i + 1):
                col = i - row
                if col < 0 or col > w - 1 or row > h - 1:
                    continue
                # instantiate a PE and record partial results
                input_a = localA[row] if col == 0 else hcl.compute(
                    (1,), lambda x: net[row][col - 1][0], "input_a{}{}".format(row, col))
                input_b = localB[col] if row == 0 else hcl.compute(
                    (1,), lambda x: net[row - 1][col][1], "input_b{}{}".format(row, col))
                out = hcl.compute((3,), lambda x: PE['pe_%d' % (row * w + col)](
                    input_a, input_b, x), "out_{}{}".format(row, col))
                O[row + h * b_row, col + w * b_col] += out[2]
                net[row][col] = out

    block_rows = int(m / h)
    block_cols = int(n / w)
    O = hcl.compute((m, n), lambda *args: 0, name="Output")
    hcl.mutate((block_rows, block_cols, k),
               lambda b_row, b_col, k: update(b_row, b_col, k, O),
               name="update")
    return O

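# `w`, `h`, `m`, `n`, and `k` are free variables in `systolic_array`; the
# surrounding harness is assumed to define them at module level. Illustrative
# values consistent with the blocking logic above:
h, w = 2, 2   # PE array height and width (block size)
m, n = 4, 4   # output matrix is m x n; multiples of h and w respectively
k = 4         # trip count of the reduction dimension fed to hcl.mutate
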
def top(A, B):
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    prod = hcl.compute((final_extent_0, final_extent_1), lambda x, y: 0,
                       name="prod", dtype=hcl.Float(bits=32))
    with hcl.Stage("prod"):
        with hcl.for_(final_min_1, final_extent_1, name="prod_s0_y") as prod_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="prod_s0_x") as prod_s0_x:
                prod[prod_s0_x, prod_s0_y] = hcl.cast(dtype=hcl.Float(bits=32), expr=0.000000)
        with hcl.for_(final_min_1, final_extent_1, name="prod_s1_y") as prod_s1_y:
            with hcl.for_(final_min_0, final_extent_0, name="prod_s1_x") as prod_s1_x:
                with hcl.for_(0, 1024, name="prod_s1_r__x") as prod_s1_r__x:
                    prod[prod_s1_x, prod_s1_y] = (prod[prod_s1_x, prod_s1_y] +
                                                  (A[prod_s1_x, prod_s1_r__x] *
                                                   B[prod_s1_r__x, prod_s1_y]))
    final = hcl.compute((1024, 1024), lambda x, y: 0, name="final", dtype=hcl.Float(bits=32))
    with hcl.Stage("final"):
        with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
            with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                final[final_s0_x, final_s0_y] = prod[final_s0_x, final_s0_y]
    return final

def kernel(A):
    B = hcl.compute((2, 2), lambda x, y: 0, "B", dtype)
    # syntax sugar
    with hcl.Stage("S"):
        LB = hcl.compute((2, 4), lambda x, y: 0, "LB", dtype)
        with hcl.for_(0, 2, name="x") as x:
            with hcl.for_(0, 2, name="y") as y:
                with hcl.for_(0, 2, name="LB_i") as LB_i:
                    with hcl.for_(0, 4, name="LB_j") as LB_j:
                        LB[LB_i, LB_j] = A[x * 2 + LB_i, LB_j]
                val = hcl.scalar(0, "val")
                with hcl.for_(0, 2, name="r") as r:
                    with hcl.for_(0, 2, name="c") as c:
                        val.v += LB[r, y * 2 + c]
                B[x, y] = val / 4
    return B

def pool(data):
    out = hcl.compute((2, 2), lambda x, y: 0, "out", dtype)
    with hcl.Stage("S"):
        LB = hcl.compute((2, 4), lambda x, y: 0, "LB", dtype)
        with hcl.for_(0, 2, name="x") as x:
            with hcl.for_(0, 2, name="y") as y:
                with hcl.for_(0, 2, name="LB_i") as LB_i:
                    with hcl.for_(0, 4, name="LB_j") as LB_j:
                        LB[LB_i, LB_j] = data[x * 2 + LB_i, LB_j]
                val = hcl.scalar(0, "val")
                with hcl.for_(0, 2, name="r") as r:
                    with hcl.for_(0, 2, name="c") as c:
                        val.v += LB[r, y * 2 + c]
                out[x, y] = val / 4
    return out

def top(input):
    final_total_extent_1 = (hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_1) *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_0))
    final_total_extent_2 = (final_total_extent_1 *
                            hcl.cast(dtype=hcl.Int(bits=64), expr=final_extent_2))
    linear = hcl.compute(((final_extent_0 + 2), (final_extent_1 + 2), final_extent_2),
                         lambda x, y, z: 0, name="linear", dtype=hcl.Float(bits=32))
    with hcl.Stage("linear"):
        with hcl.for_(final_min_2, final_extent_2, name="linear_s0_c") as linear_s0_c:
            with hcl.for_(final_min_1, (final_extent_1 + 2), name="linear_s0_y") as linear_s0_y:
                with hcl.for_(final_min_0, (final_extent_0 + 2), name="linear_s0_x") as linear_s0_x:
                    t4 = input[linear_s0_x, linear_s0_y, linear_s0_c]
                    linear[linear_s0_x, linear_s0_y, linear_s0_c] = hcl.select(
                        (hcl.cast(dtype=hcl.Float(bits=32), expr=0.040450) < t4),
                        hcl.power(((t4 * hcl.cast(dtype=hcl.Float(bits=32), expr=0.947867)) +
                                   hcl.cast(dtype=hcl.Float(bits=32), expr=0.052133)),
                                  hcl.cast(dtype=hcl.Float(bits=32), expr=2.400000)),
                        (t4 * hcl.cast(dtype=hcl.Float(bits=32), expr=0.077399)))
    blur_x = hcl.compute((final_extent_0, (final_extent_1 + 2), final_extent_2),
                         lambda x, y, z: 0, name="blur_x", dtype=hcl.Float(bits=32))
    with hcl.Stage("blur_x"):
        with hcl.for_(final_min_2, final_extent_2, name="blur_x_s0_c") as blur_x_s0_c:
            with hcl.for_(final_min_1, (final_extent_1 + 2), name="blur_x_s0_y") as blur_x_s0_y:
                with hcl.for_(final_min_0, final_extent_0, name="blur_x_s0_x") as blur_x_s0_x:
                    blur_x[blur_x_s0_x, blur_x_s0_y, blur_x_s0_c] = (
                        (linear[(blur_x_s0_x + 2), blur_x_s0_y, blur_x_s0_c] +
                         (linear[blur_x_s0_x, blur_x_s0_y, blur_x_s0_c] +
                          linear[(blur_x_s0_x + 1), blur_x_s0_y, blur_x_s0_c])) *
                        hcl.cast(dtype=hcl.Float(bits=32), expr=0.333333))
    blur_y = hcl.compute((final_extent_0, final_extent_1, final_extent_2),
                         lambda x, y, z: 0, name="blur_y", dtype=hcl.Float(bits=32))
    with hcl.Stage("blur_y"):
        with hcl.for_(final_min_2, final_extent_2, name="blur_y_s0_c") as blur_y_s0_c:
            with hcl.for_(final_min_1, final_extent_1, name="blur_y_s0_y") as blur_y_s0_y:
                with hcl.for_(final_min_0, final_extent_0, name="blur_y_s0_x") as blur_y_s0_x:
                    blur_y[blur_y_s0_x, blur_y_s0_y, blur_y_s0_c] = (
                        (blur_x[blur_y_s0_x, (blur_y_s0_y + 2), blur_y_s0_c] +
                         (blur_x[blur_y_s0_x, blur_y_s0_y, blur_y_s0_c] +
                          blur_x[blur_y_s0_x, (blur_y_s0_y + 1), blur_y_s0_c])) *
                        hcl.cast(dtype=hcl.Float(bits=32), expr=0.333333))
    srgb = hcl.compute((final_extent_0, final_extent_1, final_extent_2),
                       lambda x, y, z: 0, name="srgb", dtype=hcl.Float(bits=32))
    with hcl.Stage("srgb"):
        with hcl.for_(final_min_2, final_extent_2, name="srgb_s0_c") as srgb_s0_c:
            with hcl.for_(final_min_1, final_extent_1, name="srgb_s0_y") as srgb_s0_y:
                with hcl.for_(final_min_0, final_extent_0, name="srgb_s0_x") as srgb_s0_x:
                    t5 = blur_y[srgb_s0_x, srgb_s0_y, srgb_s0_c]
                    srgb[srgb_s0_x, srgb_s0_y, srgb_s0_c] = hcl.select(
                        (hcl.cast(dtype=hcl.Float(bits=32), expr=0.003131) < t5),
                        ((hcl.power(t5, hcl.cast(dtype=hcl.Float(bits=32), expr=0.416667)) *
                          hcl.cast(dtype=hcl.Float(bits=32), expr=1.055000)) +
                         hcl.cast(dtype=hcl.Float(bits=32), expr=-0.055000)),
                        (t5 * hcl.cast(dtype=hcl.Float(bits=32), expr=12.920000)))
    final = hcl.compute((766, 1278, 3), lambda x, y, z: 0, name="final", dtype=hcl.Float(bits=32))
    with hcl.Stage("final"):
        with hcl.for_(final_min_2, final_extent_2, name="final_s0_c") as final_s0_c:
            with hcl.for_(final_min_1, final_extent_1, name="final_s0_y") as final_s0_y:
                with hcl.for_(final_min_0, final_extent_0, name="final_s0_x") as final_s0_x:
                    final[final_s0_x, final_s0_y, final_s0_c] = srgb[final_s0_x, final_s0_y, final_s0_c]
    return final

def zculling(size_pixels, size, fragment, z_buffer, pixels):
    pixel_cntr = hcl.scalar(0, dtype=hcl.Int())
    with hcl.Stage("S2"):
        with hcl.for_(0, size) as n:
            x = hcl.scalar(fragment[n][0], dtype=hcl.Int())
            y = hcl.scalar(fragment[n][1], dtype=hcl.Int())
            z = hcl.scalar(fragment[n][2])
            color = hcl.scalar(fragment[n][3])
            # depth test: keep the fragment only if it is closer than the buffer
            with hcl.if_(z < z_buffer[y][x]):
                pixels[pixel_cntr][0] = x.v
                pixels[pixel_cntr][1] = y.v
                pixels[pixel_cntr][2] = color.v
                pixel_cntr.v += 1
                z_buffer[y][x] = z.v
        size_pixels[0] = pixel_cntr.v

def test_if():
    hcl.init()

    def absolute(A, B):
        with hcl.for_(0, A.shape[0], name="x") as x:
            with hcl.for_(0, A.shape[1], name="y") as y:
                with hcl.if_(A[x, y] >= 0):
                    B[x, y] = A[x, y]
                with hcl.else_():
                    B[x, y] = -A[x, y]

    A = hcl.placeholder((10, 20), name="A", dtype="float32")
    B = hcl.placeholder(A.shape, name="B", dtype="float32")
    with hcl.Stage() as C:
        absolute(A, B)
    s = hcl.create_schedule([A, B])
    o, i = s[C].split(C.x, factor=3)
    s[C].reorder(i, o)

    # test lower
    ir = hcl.lower(s)
    assert str(ir.body.body.body.body).startswith("for (x.inner, 0, 3)")
    assert str(ir.body.body.body.body.body).startswith("for (x.outer, 0, 4)")
    assert str(ir.body.body.body.body.body.body).startswith("for (y, 0, 20)")
    assert str(ir.body.body.body.body.body.body.body.condition).startswith(
        "(x.inner < (10 - (x.outer*3)))")
    assert str(ir.body.body.body.body.body.body.body.then_case.condition).startswith(
        "(0.000000f <= A[(y + ((x.inner + (x.outer*3))*20))])")
    assert str(ir.body.body.body.body.body.body.body.then_case.then_case).startswith(
        "B[(y + ((x.inner + (x.outer*3))*20))] = A[(y + ((x.inner + (x.outer*3))*20))]")
    assert str(ir.body.body.body.body.body.body.body.then_case.else_case).startswith(
        "B[(y + ((x.inner + (x.outer*3))*20))] = (A[(y + ((x.inner + (x.outer*3))*20))]*-1.000000f)")

    # test build
    f = hcl.build(s)
    a_np = np.random.random((A.shape))
    a_hcl = hcl.asarray(a_np, dtype="float32")
    b_hcl = hcl.asarray(np.zeros(B.shape), dtype="float32")
    f(a_hcl, b_hcl)
    b_np = np.abs(a_np)
    np.testing.assert_allclose(b_np, b_hcl.asnumpy())

def byte_swap_rtl(input_vec, ret=None, name=None):
    if name is None:
        name = "my_byteswap"
    Len = input_vec.shape[0]

    return_tensors = False
    if ret is None:
        return_tensors = True
        ret = hcl.compute(input_vec.shape, lambda *args: 0, "vec")

    # functional behavior
    with hcl.Stage("ExternModule") as Module:
        hcl.update(ret, lambda *args: input_vec[args] << 16 | input_vec[args] >> 16, "swap")

    dicts = {}
    dicts["name"] = name
    tensors = [input_vec]
    dicts["args"] = [(_.name, _.dtype) for _ in tensors]

    # declare headers and typedef
    dicts["header"] = "unsigned int my_byteswap(unsigned int x);"
    dicts["func"] = """
        for (int k = 0; k < {}; k++) {{
            vec[k] = my_byteswap({}[k]);
        }}
    """.format(Len, input_vec.name)

    # add dependency files or folders
    # the dependencies are copied to project folder
    deps = os.path.dirname(os.path.abspath(__file__))
    dicts["deps"] = deps + "/lib1"

    # custom compilation command (root path: project)
    # commands executed before impl or emulation
    dicts["cmds"] = "cd lib1; " + \
        "aocl library hdl-comp-pkg opencl_lib.xml -o opencl_lib.aoco;" + \
        "aocl library create -name opencl_lib opencl_lib.aoco;"

    # custom compiler flags (load custom libs)
    dicts["flags"] = "-I lib1 -L lib1 -l opencl_lib.aoclib"

    create_extern_module(Module, dicts, ip_type="rtl")
    if return_tensors:
        return ret

def toynn_vhls_ip(input_1, output_1, name=None):
    if name is None:
        name = "myproject"
    # Function behavior definition
    with hcl.Stage("ExternModule.toyNN") as Module:
        register_tensors([input_1, output_1])
        Module.ext_ip_name = name
        Module.inputs = [input_1, output_1]

        # Include cpp/hpp files
        if not os.path.exists("firmware"):
            urllib.request.urlretrieve(
                "https://raw.githubusercontent.com/Hecmay/debug.trace/main/toynn.tar.gz",
                filename="toynn.tar.gz")
            os.system("tar -zxvf toynn.tar.gz")
        source = [
            "firmware/myproject.cpp",
            "firmware/nnet_utils/",
            "firmware/weights/"
        ]
        Module.source = include_dependency(source)
        create_extern_module(Module, ip_type="HLS")

def vadd_rtl(A, B, length, ret=None, name=None):
    if name is None:
        name = "vadd_rtl"
    Len = A.shape[0]
    assert A.shape == B.shape, "shape not match"
    assert Len == length, "shape not match"

    return_tensors = False
    if ret is None:
        return_tensors = True
        ret = hcl.compute(A.shape, lambda *args: 0, "ret")

    # functional behavior
    with hcl.Stage("ExternModule") as Module:
        hcl.update(ret, lambda *args: A[args] + B[args], "vadd")

    dicts = {}
    dicts["name"] = name
    tensors = [A, B]
    dicts["args"] = [(_.name, _.dtype) for _ in tensors]

    # RTL IP is wrapped as a separate OpenCL kernel in Vitis
    # add dependency files or folders
    # the dependencies are copied to project folder
    deps = os.path.dirname(os.path.abspath(__file__))
    dicts["deps"] = deps + "/scripts"

    # custom compilation command (root path: project)
    # commands executed before impl or emulation
    dicts["cmds"] = "vivado -mode batch -source " + \
        "scripts/gen_xo.tcl -tclargs vadd.xo vadd hw_emu {} {}"

    # custom compiler flags (load custom libs)
    dicts["flags"] = "vadd.xo"

    create_extern_module(Module, dicts, ip_type="rtl")
    if return_tensors:
        return ret

def HJ_PDE_solver(V_new, V_init, thetas):
    # Calculate spatial derivative based on index and dimension number
    # (placeholder arithmetic on the grid indices; `dim` is currently unused)
    def spatial_derivative(i, j, k, dim):
        left = i * j - k
        right = i * j + k
        return left, right

    # Calculate Hamiltonian for every grid point in V_init
    with hcl.Stage("Hamiltonian"):
        with hcl.for_(1, V_init.shape[0], name="i") as i:
            with hcl.for_(1, V_init.shape[1], name="j") as j:
                with hcl.for_(1, V_init.shape[2], name="k") as k:
                    # Calculate left/right spatial derivatives per dimension
                    dV_dx_L, dV_dx_R = spatial_derivative(i, j, k, 0)
                    dV_dy_L, dV_dy_R = spatial_derivative(i, j, k, 1)
                    dV_dtheta_L, dV_dtheta_R = spatial_derivative(i, j, k, 2)

                    # Calculate average gradient
                    dV_dx_C = (dV_dx_L + dV_dx_R) / 2
                    dV_dy_C = (dV_dy_L + dV_dy_R) / 2
                    dV_dtheta_C = (dV_dtheta_L + dV_dtheta_R) / 2

                    # Get optimal control
                    uOpt = 1
                    # Velocity
                    v = 1
                    # Assume that mode is min
                    with hcl.if_(dV_dtheta_C > 0):
                        uOpt = -uOpt

                    # Calculate dynamics function:
                    # V_new[i,j,k] = v * cos(thetas[k]) * dV_dx_C
                    #              + v * sin(thetas[k]) * dV_dy_C
                    #              + uOpt * dV_dtheta_C
                    # angle = hcl.scalar(thetas[k], "angle")
                    V_new[i, j, k] = (v * hcl.cos(thetas[k]) * dV_dx_C +
                                      v * hcl.sin(thetas[k]) * dV_dy_C +
                                      dV_dtheta_C * uOpt)