def mul_rshift(m, clk, rst,
               x_datawidth, x_point, x_signed,
               y_datawidth, y_point, y_signed,
               mul_width=None, mul_point=None, mul_signed=None):
    """Create a stream that multiplies 'x' by 'y' and right-shifts the product.

    The stream has the sources 'x', 'y' and an unsigned 'rshift', and a
    single sink 'z' fed by ``stream.Sra(x * y, rshift)``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width: If not None, assigned to the product's ``width``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the product is passed through ``stream.Cast(point=mul_point)``.
        mul_signed: If not None, assigned to the product's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('mul')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    lhs = stream.source('x', x_datawidth, x_point, x_signed)
    rhs = stream.source('y', y_datawidth, y_point, y_signed)

    # The shift-amount port is sized from the widest operand.
    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    product = lhs * rhs
    product.latency = 4

    if mul_width is not None:
        product.width = mul_width
    if mul_signed is not None:
        product.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            product = stream.Cast(product, point=mul_point)

    shifted = stream.Sra(product, shamt)
    stream.sink(shifted, 'z')
    return stream
def madd_rshift(m, clk, rst,
                x_datawidth, x_point, x_signed,
                y_datawidth, y_point, y_signed,
                z_datawidth, z_point, z_signed,
                sum_width=None, sum_point=None, sum_signed=None):
    """Create a stream computing ``stream.Sra(stream.Madd(x, y, z), rshift)``.

    The stream has the sources 'x', 'y', 'z' and an unsigned 'rshift', and a
    single sink 'sum'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        z_datawidth, z_point, z_signed: Settings of source 'z'.
        sum_width: If not None, assigned to the Madd result's ``width``.
        sum_point: If not None and different from the largest input point,
            the result is passed through ``stream.Cast(point=sum_point)``.
        sum_signed: If not None, assigned to the Madd result's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    name = _tmp_name('madd')
    datawidth = max(x_datawidth, y_datawidth, z_datawidth)
    point = max(x_point, y_point, z_point)

    stream = vthread.Stream(m, name, clk, rst, datawidth)
    x = stream.source('x', x_datawidth, x_point, x_signed)
    y = stream.source('y', y_datawidth, y_point, y_signed)
    z = stream.source('z', z_datawidth, z_point, z_signed)
    rshift = stream.source('rshift', signed=False)
    rshift.width = int(math.ceil(math.log(datawidth, 2))) + 1

    # Local is named `total` so the builtin `sum` is not shadowed.
    total = stream.Madd(x, y, z)
    total.latency = 4

    # The output options of this function are the sum_* parameters; the
    # mul_* names are not defined in this function's scope.
    if sum_width is not None:
        total.width = sum_width
    if sum_signed is not None:
        total.signed = sum_signed
    if sum_point is not None and point != sum_point:
        total = stream.Cast(total, point=sum_point)

    total = stream.Sra(total, rshift)
    stream.sink(total, 'sum')
    return stream
def mul(m, clk, rst,
        x_datawidth, x_point, x_signed,
        y_datawidth, y_point, y_signed,
        mul_width=None, mul_point=None, mul_signed=None):
    """Create a stream whose sink 'z' is the product of sources 'x' and 'y'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width: If not None, assigned to the product's ``width``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the product is passed through ``stream.Cast(point=mul_point)``.
        mul_signed: If not None, assigned to the product's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('mul')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    lhs = stream.source('x', x_datawidth, x_point, x_signed)
    rhs = stream.source('y', y_datawidth, y_point, y_signed)

    product = lhs * rhs
    product.latency = 4

    if mul_width is not None:
        product.width = mul_width
    if mul_signed is not None:
        product.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            product = stream.Cast(product, point=mul_point)

    stream.sink(product, 'z')
    return stream
def acc_rshift_round_frac(m, clk, rst,
                          datawidth, point, signed,
                          sum_width=None, sum_point=None, sum_signed=None):
    """Create an accumulating stream that adds a rounding term, then shifts.

    Source 'x' is reduced with ``stream.ReduceAddValid`` over the constant
    'size'. The term ``1 << (rshift - 1)`` (or 0 when ``rshift`` is not
    greater than 0) is added to the reduction result before it goes through
    ``stream.Sra`` with 'rshift'. Sinks are 'sum' and 'valid'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings of source 'x'.
        sum_width: Passed as ``width`` to ``ReduceAddValid`` and assigned,
            as is (even when None), to the rounding term's ``width``.
        sum_point: If not None and different from ``point``, the reduction
            result is passed through ``stream.Cast(point=sum_point)``.
        sum_signed: Passed as ``signed`` to ``ReduceAddValid``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('acc')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    data_in = stream.source('x', datawidth, point, signed)

    shift_bits = int(math.ceil(math.log(datawidth, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    count = stream.constant('size', signed=False)

    # Rounding term, built in the same node order as a single nested call.
    shifting = shamt > 0
    half_unit = stream.Sll(1, shamt - 1)
    rounding = stream.Mux(shifting, half_unit, 0)
    rounding.width = sum_width

    total, done = stream.ReduceAddValid(data_in, count,
                                        width=sum_width, signed=sum_signed)

    if sum_point is not None:
        if point != sum_point:
            total = stream.Cast(total, point=sum_point)

    rounded = total + rounding
    shifted = stream.Sra(rounded, shamt)

    stream.sink(shifted, 'sum')
    stream.sink(done, 'valid')
    return stream
def div_const(m, clk, rst,
              x_datawidth, x_point, x_signed,
              y_datawidth, y_point, y_signed,
              div_width=None, div_point=None, div_signed=None):
    """Create a stream whose sink 'z' is ``stream.Div(x, y)``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        div_width: If not None, assigned to the quotient's ``width``.
        div_point: If not None and different from ``max(x_point, y_point)``,
            the quotient is passed through ``stream.Cast(point=div_point)``.
        div_signed: If not None, assigned to the quotient's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('div_const')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    numerator = stream.source('x', x_datawidth, x_point, x_signed)
    denominator = stream.source('y', y_datawidth, y_point, y_signed)

    quotient = stream.Div(numerator, denominator)

    if div_width is not None:
        quotient.width = div_width
    if div_signed is not None:
        quotient.signed = div_signed
    if div_point is not None:
        if base_point != div_point:
            quotient = stream.Cast(quotient, point=div_point)

    stream.sink(quotient, 'z')
    return stream
def mkLed():
    """Build the 'blinkled' module comparing a stream sum with a software sum.

    The module gets CLK/RST inputs, an AXI master ('myaxi'), two RAMs and a
    stream ('mystream') that reduces source 'a' with ReduceAddValid. The
    nested function ``comp`` is handed to ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)

    # Stream definition: sum of 'a' over 'size' items; the sink is gated by
    # the reduction's valid signal.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    size = strm.constant('size')
    sum, sum_valid = strm.ReduceAddValid(a, size)
    strm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    def comp_stream(size, offset):
        # Bind the stream ports to the RAMs, then run it and wait for it.
        strm.set_source('a', ram_a, offset, size)
        strm.set_constant('size', size)
        strm.set_sink('sum', ram_b, offset, 1)
        strm.run()
        strm.join()

    def comp_sequential(size, offset):
        # Reference computation: add up ram_a[offset:offset+size] and store
        # the single result at ram_b[offset].
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            sum += a
        ram_b.write(offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_b at the two offsets; print OK or NG.
        all_ok = True
        for i in range(size):
            st = ram_b.read(i + offset_stream)
            sq = ram_b.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp(size):
        # Stream version, working at RAM offset 0.
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_b, offset, 1024, 1)

        # Sequential version, working at RAM offset `size`.
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_b, offset, 1024 * 2, 1)

        # Each version produces one word, so one word is compared.
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed to start(); presumably it becomes comp's `size` - confirm.
    fsm = th.start(32)

    return m
def mac_rshift_round(m, clk, rst,
                     x_datawidth, x_point, x_signed,
                     y_datawidth, y_point, y_signed,
                     mul_width=None, mul_point=None, mul_signed=None,
                     sum_width=None, sum_point=None, sum_signed=None):
    """Create a multiply-accumulate stream with ``SraRound`` on each product.

    ``x * y`` goes through ``stream.SraRound`` with 'rshift' and the result
    is reduced with ``stream.ReduceAddValid`` over the constant 'size'.
    Sinks are 'sum' and 'valid'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width, mul_signed: If not None, assigned to the product's
            ``width`` / ``signed``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the product is passed through ``stream.Cast(point=mul_point)``.
        sum_width, sum_signed: Passed as ``width`` / ``signed`` to
            ``ReduceAddValid``.
        sum_point: If not None and different from ``max(x_point, y_point)``,
            the reduction result is passed through
            ``stream.Cast(point=sum_point)``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('mac')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    lhs = stream.source('x', x_datawidth, x_point, x_signed)
    rhs = stream.source('y', y_datawidth, y_point, y_signed)

    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    count = stream.constant('size', signed=False)

    product = lhs * rhs
    product.latency = 4

    if mul_width is not None:
        product.width = mul_width
    if mul_signed is not None:
        product.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            product = stream.Cast(product, point=mul_point)

    rounded = stream.SraRound(product, shamt)

    total, done = stream.ReduceAddValid(rounded, count,
                                        width=sum_width, signed=sum_signed)

    if sum_point is not None:
        if base_point != sum_point:
            total = stream.Cast(total, point=sum_point)

    stream.sink(total, 'sum')
    stream.sink(done, 'valid')
    return stream
def mul_rshift_clip(m, clk, rst,
                    x_datawidth, x_point, x_signed,
                    y_datawidth, y_point, y_signed,
                    mul_width=None, mul_point=None, mul_signed=None,
                    out_width=None, out_point=None, out_signed=None,
                    asymmetric_clip=False):
    """Create a stream that multiplies, right-shifts and clips the result.

    ``stream.Sra(x * y, rshift)`` is limited to an upper and a lower
    threshold derived from ``out_width`` and ``out_point`` and written to
    sink 'z'.

    Note:
        ``out_width`` and ``out_point`` default to None but are used
        unconditionally in the threshold arithmetic, so leaving either at
        None raises ``TypeError``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width, mul_signed: If not None, assigned to the product's
            ``width`` / ``signed``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the product is passed through ``stream.Cast(point=mul_point)``.
        out_width: Bit width used for the thresholds; also assigned to the
            clipped value's ``width``.
        out_point: Both thresholds are shifted right by this amount; the
            clipped value is cast when its ``point`` differs.
        out_signed: If not None, assigned to the clipped value's ``signed``.
        asymmetric_clip: When true the lower threshold is one less than the
            negated upper threshold; otherwise it is exactly the negation.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('mul_rshift_clip')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    lhs = stream.source('x', x_datawidth, x_point, x_signed)
    rhs = stream.source('y', y_datawidth, y_point, y_signed)

    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    product = lhs * rhs
    product.latency = 4

    if mul_width is not None:
        product.width = mul_width
    if mul_signed is not None:
        product.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            product = stream.Cast(product, point=mul_point)

    shifted = stream.Sra(product, shamt)

    # Clip thresholds are plain Python integers computed at build time.
    upper = (1 << (out_width - 1)) - 1
    lower = -upper - 1 if asymmetric_clip else -upper
    upper >>= out_point
    lower >>= out_point

    clipped_high = stream.Mux(shifted > upper, upper, shifted)
    clipped_low = stream.Mux(shifted < lower, lower, shifted)
    result = stream.Mux(shifted >= 0, clipped_high, clipped_low)

    if out_width is not None:
        result.width = out_width
    if out_signed is not None:
        result.signed = out_signed
    if out_point is not None:
        if result.point != out_point:
            result = stream.Cast(result, point=out_point)

    stream.sink(result, 'z')
    return stream
def madd_rshift_clip(m, clk, rst,
                     x_datawidth, x_point, x_signed,
                     y_datawidth, y_point, y_signed,
                     z_datawidth, z_point, z_signed,
                     sum_width=None, sum_point=None, sum_signed=None,
                     out_width=None, out_point=None, out_signed=None):
    """Create a stream computing a clipped ``Sra(Madd(x, y, z), rshift)``.

    The shifted multiply-add result is limited to a symmetric pair of
    thresholds derived from ``out_width`` and ``out_point`` and written to
    sink 'sum'.

    Note:
        ``out_width`` and ``out_point`` default to None but are used
        unconditionally in the threshold arithmetic, so leaving either at
        None raises ``TypeError``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        z_datawidth, z_point, z_signed: Settings of source 'z'.
        sum_width, sum_signed: If not None, assigned to the Madd result's
            ``width`` / ``signed``.
        sum_point: If not None and different from the largest input point,
            the Madd result is passed through
            ``stream.Cast(point=sum_point)``.
        out_width: Bit width used for the thresholds; also assigned to the
            clipped value's ``width``.
        out_point: Both thresholds are shifted right by this amount; the
            clipped value is cast when its ``point`` differs.
        out_signed: If not None, assigned to the clipped value's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    name = _tmp_name('madd')
    datawidth = max(x_datawidth, y_datawidth, z_datawidth)
    point = max(x_point, y_point, z_point)

    stream = vthread.Stream(m, name, clk, rst, datawidth)
    x = stream.source('x', x_datawidth, x_point, x_signed)
    y = stream.source('y', y_datawidth, y_point, y_signed)
    z = stream.source('z', z_datawidth, z_point, z_signed)
    rshift = stream.source('rshift', signed=False)
    rshift.width = int(math.ceil(math.log(datawidth, 2))) + 1

    # Local is named `total` so the builtin `sum` is not shadowed.
    total = stream.Madd(x, y, z)
    total.latency = 4

    # The intermediate-result options of this function are the sum_*
    # parameters; the mul_* names are not defined in this function's scope.
    if sum_width is not None:
        total.width = sum_width
    if sum_signed is not None:
        total.signed = sum_signed
    if sum_point is not None and point != sum_point:
        total = stream.Cast(total, point=sum_point)

    total = stream.Sra(total, rshift)

    # Clip thresholds are plain Python integers computed at build time.
    p_th = (1 << (out_width - 1)) - 1
    n_th = -1 * p_th

    p_th = p_th >> out_point
    n_th = n_th >> out_point

    p = stream.Mux(total > p_th, p_th, total)
    n = stream.Mux(total < n_th, n_th, total)
    total = stream.Mux(total >= 0, p, n)

    if out_width is not None:
        total.width = out_width
    if out_signed is not None:
        total.signed = out_signed
    if out_point is not None and total.point != out_point:
        total = stream.Cast(total, point=out_point)

    stream.sink(total, 'sum')
    return stream
def average(m, clk, rst, datawidth, point, signed, num_vars):
    """Create a stream whose sink 'val' is ``stream.Average`` of its sources.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings shared by every source.
        num_vars: Number of sources; they are named 'var0', 'var1', ...

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('average')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    inputs = []
    for index in range(num_vars):
        port = stream.source('var%d' % index, datawidth, point, signed)
        inputs.append(port)

    mean = stream.Average(*inputs)
    stream.sink(mean, 'val')
    return stream
def add_tree_rshift_round(m, clk, rst, datawidth, point, signed, num_vars):
    """Create a stream that sums its sources and applies ``SraRound``.

    The sources 'var0' .. 'var<num_vars-1>' are combined with ``Add3Tree``
    and the result goes through ``stream.SraRound`` with the unsigned
    'rshift' source. The sink is 'sum'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings shared by every 'var' source.
        num_vars: Number of 'var' sources.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('add_tree')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    inputs = []
    for index in range(num_vars):
        port = stream.source('var%d' % index, datawidth, point, signed)
        inputs.append(port)

    shift_bits = int(math.ceil(math.log(datawidth, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    total = Add3Tree(stream, *inputs)
    rounded = stream.SraRound(total, shamt)

    stream.sink(rounded, 'sum')
    return stream
def add_tree(m, clk, rst, datawidth, point, signed, num_vars):
    """Create a stream whose sink 'sum' adds up the sources 'var0', 'var1', ...

    With exactly one source the value is passed through ``stream.Cast``;
    otherwise the sources are combined with ``Add3Tree``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings shared by every source.
        num_vars: Number of sources.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('add_tree')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    inputs = []
    for index in range(num_vars):
        port = stream.source('var%d' % index, datawidth, point, signed)
        inputs.append(port)

    total = (stream.Cast(inputs[0]) if len(inputs) == 1
             else Add3Tree(stream, *inputs))

    stream.sink(total, 'sum')
    return stream
def lshift_rshift(m, clk, rst,
                  x_datawidth, x_point, x_signed,
                  y_datawidth, y_point, y_signed,
                  mul_width=None, mul_point=None, mul_signed=None):
    """Create a stream that left-shifts 'x' by ``Abs(y)``, then ``SraRound``.

    ``stream.Sll(x, stream.Abs(y))`` is cast to ``x_signed``; a Mux driven
    by ``stream.Sign(y)`` selects between its ``Complement2`` and the value
    itself. The result goes through ``stream.SraRound`` with 'rshift' and
    is written to sink 'z'. Every intermediate node gets ``latency = 0``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width, mul_signed: If not None, assigned to the intermediate
            value's ``width`` / ``signed``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the value is passed through ``stream.Cast(point=mul_point)``.

    Returns:
        The constructed ``vthread.Stream``.

    Raises:
        ValueError: If ``y_point`` is not 0.
    """
    if y_point != 0:
        raise ValueError('not supported')

    stream_name = _tmp_name('lshift')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    value = stream.source('x', x_datawidth, x_point, x_signed)
    amount = stream.source('y', y_datawidth, y_point, y_signed)

    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    magnitude = stream.Abs(amount)
    sign_flag = stream.Sign(amount)

    shifted = stream.Sll(value, magnitude)
    shifted.latency = 0
    shifted = stream.Cast(shifted, signed=x_signed)
    shifted.latency = 0

    negated = stream.Complement2(shifted)
    result = stream.Mux(sign_flag, negated, shifted)
    result.latency = 0
    result = stream.Cast(result, signed=x_signed)
    result.latency = 0

    if mul_width is not None:
        result.width = mul_width
    if mul_signed is not None:
        result.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            result = stream.Cast(result, point=mul_point)
            result.latency = 0

    rounded = stream.SraRound(result, shamt)
    stream.sink(rounded, 'z')
    return stream
def updown_mask_rshift(m, clk, rst,
                       x_datawidth, x_point, x_signed,
                       y_datawidth, y_point, y_signed,
                       mul_width=None, mul_point=None, mul_signed=None):
    """Create a stream that keeps, negates or zeroes 'x' depending on 'y'.

    The selected value is 'x' when ``y > 0``, ``stream.Complement2(x)``
    when ``y < 0`` and 0 otherwise. It then goes through
    ``stream.SraRound`` with 'rshift' and is written to sink 'z'. Every
    intermediate node gets ``latency = 0``.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'; only a
            signed, 2-bit wide 'y' with point 0 is accepted.
        mul_width, mul_signed: If not None, assigned to the intermediate
            value's ``width`` / ``signed``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the value is passed through ``stream.Cast(point=mul_point)``.

    Returns:
        The constructed ``vthread.Stream``.

    Raises:
        ValueError: If ``y_datawidth`` is not 2, ``y_point`` is not 0, or
            ``y_signed`` is false.
    """
    if y_datawidth != 2 or y_point != 0 or not y_signed:
        raise ValueError('not supported')

    stream_name = _tmp_name('updown_mask')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    value = stream.source('x', x_datawidth, x_point, x_signed)
    selector = stream.source('y', y_datawidth, y_point, y_signed)

    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    # Nodes are created in the same order as one nested Mux expression.
    is_positive = selector > 0
    is_negative = selector < 0
    negated = stream.Complement2(value)
    negative_or_zero = stream.Mux(is_negative, negated, 0)
    result = stream.Mux(is_positive, value, negative_or_zero)
    result.latency = 0
    result = stream.Cast(result, signed=x_signed)
    result.latency = 0

    if mul_width is not None:
        result.width = mul_width
    if mul_signed is not None:
        result.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            result = stream.Cast(result, point=mul_point)
            result.latency = 0

    rounded = stream.SraRound(result, shamt)
    stream.sink(rounded, 'z')
    return stream
def acc(m, clk, rst,
        datawidth, point, signed,
        sum_width=None, sum_point=None, sum_signed=None):
    """Create a stream that reduces source 'x' with ``ReduceAddValid``.

    The reduction runs over the constant 'size'. Sinks are 'sum' and
    'valid'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings of source 'x'.
        sum_width, sum_signed: Passed as ``width`` / ``signed`` to
            ``ReduceAddValid``.
        sum_point: If not None and different from ``point``, the reduction
            result is passed through ``stream.Cast(point=sum_point)``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('acc')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    data_in = stream.source('x', datawidth, point, signed)
    count = stream.constant('size', signed=False)

    total, done = stream.ReduceAddValid(data_in, count,
                                        width=sum_width, signed=sum_signed)

    if sum_point is not None:
        if point != sum_point:
            total = stream.Cast(total, point=sum_point)

    stream.sink(total, 'sum')
    stream.sink(done, 'valid')
    return stream
def add_tree_rshift_round_frac(m, clk, rst, datawidth, point, signed, num_vars):
    """Create a stream that sums its sources plus a rounding term, then shifts.

    The term ``1 << (rshift - 1)`` (or 0 when ``rshift`` is not greater
    than 0) is appended to the 'var' sources before they are combined with
    ``Add3Tree``; the total goes through ``stream.Sra`` with 'rshift' and
    is written to sink 'sum'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings shared by every 'var' source;
            ``datawidth`` is also the rounding term's width.
        num_vars: Number of 'var' sources.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('add_tree')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    inputs = []
    for index in range(num_vars):
        port = stream.source('var%d' % index, datawidth, point, signed)
        inputs.append(port)

    shift_bits = int(math.ceil(math.log(datawidth, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    # Rounding term, built in the same node order as a single nested call.
    shifting = shamt > 0
    half_unit = stream.Sll(1, shamt - 1)
    rounding = stream.Mux(shifting, half_unit, 0)
    rounding.width = datawidth

    operands = list(inputs)
    operands.append(rounding)

    total = Add3Tree(stream, *operands)
    shifted = stream.Sra(total, shamt)

    stream.sink(shifted, 'sum')
    return stream
def div_const_frac(m, clk, rst,
                   x_datawidth, x_point, x_signed,
                   y_datawidth, y_point, y_signed,
                   div_width=None, div_point=None, div_signed=None):
    """Create a stream dividing ``x + frac`` (or ``x - frac``) by 'y'.

    The 'frac' source is added to 'x' as is when ``x >= 0`` and as
    ``stream.Uminus(frac)`` otherwise; the adjusted value is divided by 'y'
    with ``stream.Div`` and written to sink 'z'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        div_width, div_signed: If not None, assigned to the quotient's
            ``width`` / ``signed``.
        div_point: If not None and different from ``max(x_point, y_point)``,
            the quotient is passed through ``stream.Cast(point=div_point)``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('div_const_frac')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    numerator = stream.source('x', x_datawidth, x_point, x_signed)
    denominator = stream.source('y', y_datawidth, y_point, y_signed)

    offset_in = stream.source('frac')
    offset_in.width = widest

    offset_neg = stream.Uminus(offset_in)
    offset_neg.width = widest
    offset_neg.latency = 0

    # The offset follows the sign of the numerator.
    non_negative = numerator >= 0
    offset = stream.Mux(non_negative, offset_in, offset_neg)
    offset.latency = 0
    offset.width = widest

    adjusted = stream.Add(numerator, offset)
    adjusted.latency = 0

    quotient = stream.Div(adjusted, denominator)

    if div_width is not None:
        quotient.width = div_width
    if div_signed is not None:
        quotient.signed = div_signed
    if div_point is not None:
        if base_point != div_point:
            quotient = stream.Cast(quotient, point=div_point)

    stream.sink(quotient, 'z')
    return stream
def mul_rshift_round_madd(m, clk, rst,
                          x_datawidth, x_point, x_signed,
                          y_datawidth, y_point, y_signed,
                          mul_width=None, mul_point=None, mul_signed=None):
    """Create a stream computing ``Sra(Madd(x, y, frac), rshift)``.

    ``frac`` is ``1 << (rshift - 1)`` (or 0 when ``rshift`` is not greater
    than 0), replaced by its ``stream.Uminus`` when ``x >= 0`` is false.
    The sink is 'z'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        x_datawidth, x_point, x_signed: Settings of source 'x'.
        y_datawidth, y_point, y_signed: Settings of source 'y'.
        mul_width: Assigned as is (even when None) to the initial rounding
            term's ``width``; if not None, also assigned to the Madd
            result's ``width``.
        mul_point: If not None and different from ``max(x_point, y_point)``,
            the Madd result is passed through
            ``stream.Cast(point=mul_point)``.
        mul_signed: If not None, assigned to the Madd result's ``signed``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    stream_name = _tmp_name('mul')
    widest = max(x_datawidth, y_datawidth)
    base_point = max(x_point, y_point)

    stream = vthread.Stream(m, stream_name, clk, rst, widest)
    lhs = stream.source('x', x_datawidth, x_point, x_signed)
    rhs = stream.source('y', y_datawidth, y_point, y_signed)

    shift_bits = int(math.ceil(math.log(widest, 2))) + 1
    shamt = stream.source('rshift', signed=False)
    shamt.width = shift_bits

    # Rounding term, built in the same node order as a single nested call.
    shifting = shamt > 0
    half_unit = stream.Sll(1, shamt - 1)
    rounding_pos = stream.Mux(shifting, half_unit, 0)
    rounding_pos.width = mul_width

    rounding_neg = stream.Uminus(rounding_pos)
    rounding_neg.width = widest
    rounding_neg.latency = 0

    # The rounding term follows the sign of 'x'.
    non_negative = lhs >= 0
    rounding = stream.Mux(non_negative, rounding_pos, rounding_neg)
    rounding.latency = 0
    rounding.width = widest

    product = stream.Madd(lhs, rhs, rounding)
    product.latency = 4

    if mul_width is not None:
        product.width = mul_width
    if mul_signed is not None:
        product.signed = mul_signed
    if mul_point is not None:
        if base_point != mul_point:
            product = stream.Cast(product, point=mul_point)

    shifted = stream.Sra(product, shamt)
    stream.sink(shifted, 'z')
    return stream
def reduce_max(m, clk, rst, datawidth, point, signed):
    """Create a stream that reduces source 'x' with a custom maximum function.

    The reduction uses ``stream.ReduceCustomValid`` over the constant
    'size'. Its initial value is ``-2**(datawidth - 1)`` for signed data
    and 0 otherwise. Sinks are 'data' and 'valid'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings of source 'x'.

    Returns:
        The constructed ``vthread.Stream``.
    """
    # NOTE(review): this file defines reduce_max a second time further
    # down; that later definition rebinds the name.
    stream_name = _tmp_name('_reduce_max')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    data_in = stream.source('x', datawidth, point, signed)
    count = stream.constant('size', signed=False)

    def func(lhs, rhs):
        # Keep whichever operand compares greater.
        return vg.Mux(lhs > rhs, lhs, rhs)

    start_value = -(2 ** (datawidth - 1)) if signed else 0

    largest, done = stream.ReduceCustomValid(func, data_in, count, start_value)

    stream.sink(largest, 'data')
    stream.sink(done, 'valid')
    return stream
def reduce_max(m, clk, rst, datawidth, point, signed):
    """Create a stream that reduces source 'x' with ``ReduceMaxValid``.

    The reduction runs over the constant 'size'. Its initial value is
    ``-2**(datawidth - 1)`` for signed data and 0 otherwise. Sinks are
    'data' and 'valid'.

    Args:
        m, clk, rst: Forwarded unchanged to ``vthread.Stream``.
        datawidth, point, signed: Settings of source 'x'; ``datawidth`` and
            ``signed`` are also passed to ``ReduceMaxValid``.

    Returns:
        The constructed ``vthread.Stream``.
    """
    # NOTE(review): an earlier definition of reduce_max exists in this
    # file; this later one is the binding that remains.
    stream_name = _tmp_name('_reduce_max')
    stream = vthread.Stream(m, stream_name, clk, rst, datawidth)

    data_in = stream.source('x', datawidth, point, signed)
    count = stream.constant('size', signed=False)

    start_value = -(2 ** (datawidth - 1)) if signed else 0

    largest, done = stream.ReduceMaxValid(data_in, count,
                                          initval=start_value,
                                          width=datawidth, signed=signed)

    stream.sink(largest, 'data')
    stream.sink(done, 'valid')
    return stream
def mkLed():
    """Build the 'blinkled' module testing a stream with a stride-0 pattern.

    The stream ('mystream') adds sources 'a' and 'b'. Both sources are
    configured with the pattern ``[(size, 0)]``, and the sequential
    reference reads the same address on every iteration. The nested
    function ``comp`` is handed to ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # One (size, stride) pair: `size` items with stride 0.
    size = 16
    pattern = [(size, 0)]

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    sum = a + b
    strm.sink(sum, 'sum')

    def comp_stream(offset):
        # Both sources start at offset + 10 and use the stride-0 pattern.
        strm.set_source_pattern('a', ram_a, offset + 10, pattern)
        strm.set_source_pattern('b', ram_b, offset + 10, pattern)
        strm.set_sink('sum', ram_c, offset, size)
        strm.run()
        strm.join()

    def comp_sequential(offset):
        # Reference computation: the read address does not depend on i,
        # matching the stride-0 source pattern.
        sum = 0
        for i in range(size):
            a = ram_a.read(offset + 10)
            b = ram_b.read(offset + 10)
            sum = a + b
            ram_c.write(i + offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp():
        # Stream version, working at RAM offset 0.
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # Sequential version, working at RAM offset `size`.
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module testing multi-dimensional stream patterns.

    The stream ('mystream') adds sources 'a' and 'b' into sink 'c'. All
    three ports use the same list of (size, stride) pairs, derived from
    ``shape`` visited in the order given by ``order``. The nested function
    ``comp`` is handed to ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    shape = [16, 4, 8]
    # Total element count: the product of all dimensions.
    size = functools.reduce(lambda x, y: x * y, shape, 1)
    order = [1, 2, 0]

    def to_pattern(shape, order):
        # For each dimension index in `order`, emit (dimension size, stride),
        # where the stride is the product of all later dimensions of `shape`.
        pattern = []
        for p in order:
            size = shape[p]
            stride = functools.reduce(lambda x, y: x * y,
                                      shape[p + 1:], 1)
            pattern.append((size, stride))
        return pattern

    pattern_a = to_pattern(shape, order)
    pattern_b = to_pattern(shape, order)
    pattern_c = to_pattern(shape, order)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    def comp_stream(offset):
        strm.set_source_pattern('a', ram_a, offset, pattern_a)
        strm.set_source_pattern('b', ram_b, offset, pattern_b)
        strm.set_sink_pattern('c', ram_c, offset, pattern_c)
        strm.run()
        strm.join()

    def comp_sequential(offset):
        # Reference computation: element-wise a + b in linear address order.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    def check(offset_stream, offset_seq):
        # NOTE(review): only the single word at each offset is compared,
        # not all `size` results - confirm this is intended.
        all_ok = True
        st = ram_c.read(offset_stream)
        sq = ram_c.read(offset_seq)
        if vthread.verilog.NotEql(st, sq):
            all_ok = False
        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp():
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 4, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential(offset)
        myaxi.dma_write(ram_c, offset, 1024 * 8, 1)

        # verification
        check(0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start()

    return m
def mkLed():
    """Build the 'blinkled' module testing ``Stream.Counter`` variants.

    The stream ('mystream') sinks ``a + b - a - b`` plus six counters
    created with different ``initval`` / ``size`` / ``interval`` / ``step``
    arguments. ``comp_sequential`` computes the reference values with plain
    integer arithmetic. The nested function ``comp`` is handed to
    ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    # Six counters; each has a matching expression in comp_sequential.
    cnt1 = strm.Counter()
    cnt2 = strm.Counter(initval=1)
    cnt3 = strm.Counter(initval=2, size=5)
    cnt4 = strm.Counter(initval=3, interval=3)
    cnt5 = strm.Counter(initval=4, interval=3, size=7)
    cnt6 = strm.Counter(initval=4, step=2, interval=2)
    a = strm.source('a')
    b = strm.source('b')
    # a and b cancel out, so the sink carries only the sum of the counters.
    c = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
    strm.sink(c, 'c')

    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    def comp_sequential(size, offset):
        # Reference computation of the six counter values per iteration.
        cnt = 0
        for i in range(size):
            cnt1 = cnt
            cnt2 = 1 + cnt
            cnt3 = (cnt + 2) % 5
            cnt4 = (cnt // 3) + 3
            cnt5 = ((cnt // 3) + 4) % 7
            cnt6 = (cnt // 2) * 2 + 4
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b - a - b + cnt1 + cnt2 + cnt3 + cnt4 + cnt5 + cnt6
            ram_c.write(i + offset, sum)
            cnt += 1

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed to start(); presumably it becomes comp's `size` - confirm.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module testing a multiplier stream as a substream.

    Three streams are defined:

    * 'mul_stream': ``z = x * y``.
    * 'mac_stream': adds 1 to 'a' and 'b', multiplies them through a
      substream of 'mul_stream', and reduces the products with
      ReduceAddValid.
    * 'act_stream': like 'mac_stream' but adds 2 to each input and replaces
      a sum that is not greater than 0 with 0.

    Each stream has a sequential reference function. The nested function
    ``comp`` runs mul, mac, act and then mac and act a second time, and is
    handed to ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    # Multiplier stream; also used as a substream by the two streams below.
    mulstrm = vthread.Stream(m, 'mul_stream', clk, rst)
    mulx = mulstrm.source('x')
    muly = mulstrm.source('y')
    mulz = mulx * muly
    mulstrm.sink(mulz, 'z')

    # Multiply-accumulate stream: sum of (a + 1) * (b + 1).
    macstrm = vthread.Stream(m, 'mac_stream', clk, rst)
    a = macstrm.source('a')
    b = macstrm.source('b')
    a = a + 1
    b = b + 1
    sub = macstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = macstrm.constant('size')
    sum, sum_valid = macstrm.ReduceAddValid(c, size)
    macstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    # Activation stream: sum of (a + 2) * (b + 2), floored at 0.
    actstrm = vthread.Stream(m, 'act_stream', clk, rst)
    a = actstrm.source('a')
    b = actstrm.source('b')
    a = a + 1
    b = b + 1
    a = a + 1
    b = b + 1
    sub = actstrm.substream(mulstrm)
    sub.to_source('x', a)
    sub.to_source('y', b)
    c = sub.from_sink('z')
    size = actstrm.constant('size')
    sum, sum_valid = actstrm.ReduceAddValid(c, size)
    sum = actstrm.Mux(sum > 0, sum, 0)
    actstrm.sink(sum, 'sum', when=sum_valid, when_name='sum_valid')

    def comp_stream_mul(size, offset):
        mulstrm.set_source('x', ram_a, offset, size)
        mulstrm.set_source('y', ram_b, offset, size)
        mulstrm.set_sink('z', ram_c, offset, size)
        mulstrm.run()
        mulstrm.join()

    def comp_stream_mac(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('size', size)
        macstrm.set_sink('sum', ram_c, offset, 1)
        macstrm.run()
        macstrm.join()

    def comp_stream_act(size, offset):
        actstrm.set_source('a', ram_a, offset, size)
        actstrm.set_source('b', ram_b, offset, size)
        actstrm.set_constant('size', size)
        actstrm.set_sink('sum', ram_c, offset, 1)
        actstrm.run()
        actstrm.join()

    def comp_sequential_mul(size, offset):
        # Reference for mul_stream: element-wise product.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a * b
            ram_c.write(i + offset, sum)

    def comp_sequential_mac(size, offset):
        # Reference for mac_stream: one accumulated result at ram_c[offset].
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 1
            b = ram_b.read(i + offset) + 1
            sum += a * b
        ram_c.write(offset, sum)

    def comp_sequential_act(size, offset):
        # Reference for act_stream: accumulate, then floor the result at 0.
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset) + 2
            b = ram_b.read(i + offset) + 2
            sum += a * b
        if sum <= 0:
            sum = 0
        ram_c.write(offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets and print every
        # mismatching index with both values.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    def comp(size):
        # mul
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mul(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# MUL')
        check(size, 0, offset)

        # mac
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

        # mac 2
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_mac(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# MAC')
        check(1, 0, offset)

        # act 2
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, 1)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential_act(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, 1)

        # verification
        print('# ACT')
        check(1, 0, offset)

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed to start(); presumably it becomes comp's `size` - confirm.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module testing ``Stream.CounterValid``.

    The stream ('mystream') sinks ``a + b + cntval``, where ``cntval`` is
    1000 when the counter's valid output is set and the counter value
    otherwise. ``comp_sequential`` computes the reference values in
    software. The nested function ``comp`` is handed to ``vthread.Thread``
    as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    size = strm.constant('size')
    cnt, valid = strm.CounterValid(size)
    a = strm.source('a')
    b = strm.source('b')
    # Substitute the marker value 1000 whenever `valid` is set.
    cntval = strm.Mux(valid, 1000, cnt)
    c = a + b + cntval
    strm.sink(c, 'c')

    def comp_stream(size, offset):
        # The counter's 'size' constant is half the number of items.
        strm.set_constant('size', size // 2)
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    def comp_sequential(size, offset):
        # Reference computation: cnt counts up, is replaced by 1000 when it
        # reaches size // 2 - 1, and wraps to 0 on the following iteration.
        sum = 0
        cnt = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b + cnt
            cnt += 1
            if cnt == 1001:
                cnt = 0
            if cnt == size // 2 - 1:
                cnt = 1000
            ram_c.write(i + offset, sum)

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        check(size, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed to start(); presumably it becomes comp's `size` - confirm.
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module testing a conditionally written sink.

    The stream ('mystream') computes ``c = a + b`` and sinks it only when
    ``140 < c < 150``; a second sink 'cnt' carries ``ReduceAdd`` of that
    condition. Both the stream and the sequential reference return how many
    values they counted, and that count sizes the DMA transfers and the
    comparison. The nested function ``comp`` is handed to
    ``vthread.Thread`` as 'th_comp'.

    Returns:
        The constructed ``Module``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)

    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    # Write condition, and a running sum of it as sink 'cnt'.
    v = strm.Ands(c > 140, c < 150)
    cnt = strm.ReduceAdd(v)
    strm.sink(c, 'c', when=v, when_name='v')
    strm.sink(cnt, 'cnt')

    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, 0)  # max_size
        strm.set_sink_immediate('cnt', 0)  # max_size
        strm.run()
        strm.join()
        # Read back the value of the 'cnt' sink after the run.
        cnt = strm.read_sink('cnt')
        print('# num of counted: %d' % cnt)
        return cnt

    def comp_sequential(size, offset):
        # Reference computation: store matching values contiguously from
        # ram_c[offset] and return how many were stored.
        sum = 0
        addr = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            c = a + b
            if c > 140 and c < 150:
                ram_c.write(addr + offset, c)
                addr += 1
        print('# num of counted: %d' % addr)
        return addr

    def check(size, offset_stream, offset_seq):
        # Compare `size` words of ram_c at the two offsets.
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_stream(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, cnt)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 512, size)
        cnt = comp_sequential(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, cnt)

        # verification
        # Both result sets are read back over DMA before being compared;
        # `cnt` here is the count returned by comp_sequential.
        myaxi.dma_read(ram_c, 0, 1024, cnt)
        myaxi.dma_read(ram_c, offset, 1024 * 2, cnt)
        check(cnt, 0, offset)

        vthread.finish()

    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    # 32 is passed to start(); presumably it becomes comp's `size` - confirm.
    fsm = th.start(32)

    return m
def mkLed(matrix_size=16):
    """Build the 'blinkled' module for the stream-based matrix multiply test.

    A free-running 32-bit ``timer`` register measures how long the
    computation takes.  The thread multiplies two ``matrix_size`` x
    ``matrix_size`` matrices one output element at a time using a
    multiply-accumulate stream, then checks the result.

    Args:
        matrix_size: Matrix dimension passed to the thread when it is
            started.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    # Cycle counter: incremented by the `seq` block independently of the
    # thread.
    seq = Seq(m, 'seq', clk, rst)
    timer = m.Reg('timer', 32, initval=0)
    seq(timer.inc())

    datawidth = 32
    addrwidth = 10
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)

    # Thread entry point: time the computation with `timer`, print the
    # elapsed cycle count, then verify the result.
    def matmul(matrix_size, a_offset, b_offset, c_offset):
        start_time = timer
        comp(matrix_size, a_offset, b_offset, c_offset)
        end_time = timer
        time = end_time - start_time
        print("Time (cycles): %d" % time)
        check(matrix_size, a_offset, b_offset, c_offset)

    # Stream body: multiply `size` element pairs from ram_a and ram_b,
    # accumulate the products, and write the sum to ram_c[waddr] when
    # `valid` is asserted.
    # NOTE(review): presumably RegionAdd raises `valid` once per `size`
    # inputs -- confirm against the veriloggen docs.
    def strm_madd(strm, size, waddr):
        a = strm.read(ram_a, 0, size)
        b = strm.read(ram_b, 0, size)
        sum, valid = strm.RegionAdd(a * b, size)
        strm.write(ram_c, waddr, 1, sum, when=valid)

    # Compute the product: for each i one block of A is loaded into ram_a,
    # and for each j one block of B is loaded into ram_b and the stream
    # writes result element j into ram_c.  `stream` is bound further down in
    # mkLed, before the thread is started.
    def comp(matrix_size, a_offset, b_offset, c_offset):
        a_addr, c_addr = a_offset, c_offset

        for i in range(matrix_size):
            myaxi.dma_read(ram_a, 0, a_addr, matrix_size)

            b_addr = b_offset
            for j in range(matrix_size):
                myaxi.dma_read(ram_b, 0, b_addr, matrix_size)

                stream.run(matrix_size, j)
                stream.join()

                # Advance by matrix_size words of datawidth // 8 bytes each.
                # NOTE(review): implies external addresses are byte
                # addresses -- confirm.
                b_addr += matrix_size * (datawidth // 8)

            myaxi.dma_write(ram_c, 0, c_addr, matrix_size)
            a_addr += matrix_size * (datawidth // 8)
            c_addr += matrix_size * (datawidth // 8)

    # Read the result back block by block and compare it with the expected
    # pattern: (i + 1) * 2 on the diagonal and 0 elsewhere.  Every mismatch
    # is printed.
    # NOTE(review): the expected pattern depends on the memory contents
    # prepared outside this function -- confirm with the test bench.
    def check(matrix_size, a_offset, b_offset, c_offset):
        all_ok = True
        c_addr = c_offset
        for i in range(matrix_size):
            myaxi.dma_read(ram_c, 0, c_addr, matrix_size)
            for j in range(matrix_size):
                v = ram_c.read(j)
                if i == j and vthread.verilog.NotEql(v, (i + 1) * 2):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
                if i != j and vthread.verilog.NotEql(v, 0):
                    all_ok = False
                    print("NG [%d,%d] = %d" % (i, j, v))
            c_addr += matrix_size * (datawidth // 8)

        if all_ok:
            print("OK")
        else:
            print("NG")

    # The stream is built from `strm_madd`; the thread starts `matmul` with
    # the external offsets 0, 1024 and 2048 for A, B and C.
    stream = vthread.Stream(m, 'strm_madd', clk, rst, strm_madd)
    th = vthread.Thread(m, 'th_matmul', clk, rst, matmul)
    fsm = th.start(matrix_size, 0, 1024, 2048)

    return m
def mkLed(memory_datawidth=128):
    """Build the 'blinkled' module for the multi-bank RAM stream test.

    The AXI master uses ``memory_datawidth`` bits while the three RAMs are
    32-bit ``MultibankRAM`` instances with four banks each.  The stream
    computes ``c = a + b``; a thread runs it, runs a sequential model, and
    compares both result regions of ``ram_c``.

    Args:
        memory_datawidth: Data width passed to the AXI master.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    numbanks = 4
    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, memory_datawidth)
    ram_a = vthread.MultibankRAM(m, 'ram_a', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)
    ram_b = vthread.MultibankRAM(m, 'ram_b', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)
    ram_c = vthread.MultibankRAM(m, 'ram_c', clk, rst, datawidth, addrwidth,
                                 numbanks=numbanks)

    # Stream definition: element-wise addition of the two sources.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    a = strm.source('a')
    b = strm.source('b')
    c = a + b
    strm.sink(c, 'c')

    # Bind the sources and the sink, then run the stream to completion.
    def comp_stream(size, offset):
        strm.set_source('a', ram_a, offset, size)
        strm.set_source('b', ram_b, offset, size)
        strm.set_sink('c', ram_c, offset, size)
        strm.run()
        strm.join()

    # Sequential reference model of the same addition.
    def comp_sequential(size, offset):
        sum = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum = a + b
            ram_c.write(i + offset, sum)

    # Compare `size` words of ram_c starting at the two offsets, print every
    # mismatch and then the overall verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body.  DMA transfers use `dma_size` and `dma_offset`, while RAM
    # accesses use `comp_offset`, which is scaled by the number of banks.
    # NOTE(review): presumably one DMA word spans `numbanks` RAM words --
    # confirm against the MultibankRAM documentation.
    def comp(size):
        dma_size = size
        comp_size = size * numbanks

        dma_offset = 0
        comp_offset = 0
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_stream(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024, dma_size)

        dma_offset = size
        comp_offset = comp_size
        myaxi.dma_read(ram_a, dma_offset, 0, dma_size)
        myaxi.dma_read(ram_b, dma_offset, 0, dma_size)
        comp_sequential(size, comp_offset)
        myaxi.dma_write(ram_c, dma_offset, 1024 * 2, dma_size)

        # NOTE(review): the comparison covers comp_size words although the
        # two compute helpers were given `size` -- confirm this is intended.
        check(comp_size, 0, comp_offset)

        vthread.finish()

    # Register `comp` as the thread body and start it with size = 32.
    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(32)

    return m
def mkLed():
    """Build the 'blinkled' module for the substream test.

    Two streams are defined: 'macstream', a multiply-accumulate stream with
    a reduction length given by its 'const' constant, and 'mystream', which
    uses 'macstream' as a substream and adds ``x`` to its result.  The
    thread exercises each stream against a sequential model and prints
    'OK' or 'NG' for each comparison.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # Number of products accumulated per reduction.
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    # Inner stream: accumulate a * b and expose both the running sum 'c'
    # and the valid flag 'v'.
    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.constant('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    # NOTE(review): adding 0 looks like a deliberate extra operator on the
    # valid path; its purpose is not visible here -- confirm.
    macstrm_v += 0
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    # Outer stream: feed x, y and const into the substream, add x to the
    # substream result, and emit it only when the substream's 'v' is set.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.constant('const')
    sub = strm.substream(macstrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_constant('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + x
    strm.sink(z, 'z', when=v, when_name='v')

    # Run 'macstream' on its own: sums go to ram_c, valid flags to ram_d.
    def comp_stream_macstrm(size, offset):
        macstrm.set_source('a', ram_a, offset, size)
        macstrm.set_source('b', ram_b, offset, size)
        macstrm.set_constant('const', reduce_size)
        macstrm.set_sink('c', ram_c, offset, size)
        macstrm.set_sink('v', ram_d, offset, size)
        macstrm.run()
        macstrm.join()

    # Run 'mystream': one output per reduce_size inputs is written to ram_c.
    def comp_stream_mystrm(size, offset):
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_constant('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    # Sequential model of 'macstream': a running sum that restarts after
    # every reduce_size products.  Only ram_c is compared by check(); the
    # ram_d flag values are written but not verified in this function set.
    def comp_sequential_macstrm(size, offset):
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    # Sequential model of 'mystream': at the end of each reduction the sum
    # plus the current x is written to consecutive ram_c words.
    def comp_sequential_mystrm(size, offset):
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += x * y
            val = sum + x
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    # Compare `size` words of ram_c starting at the two offsets, print every
    # mismatch and then the overall verdict.
    def check(size, offset_stream, offset_seq):
        all_ok = True
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok = False
                print(i, st, sq)
        if all_ok:
            print('OK')
        else:
            print('NG')

    # Thread body: first 'macstream' vs its model, then 'mystream' vs its
    # model.  Stream results live at RAM offset 0, model results at `size`.
    def comp(size):
        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        print('# mystream')
        check(size // reduce_size, 0, offset)

    # Register `comp` as the thread body and start it with size = 16.
    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(16)

    return m
def mkLed():
    """Build the 'blinkled' module for the nested-substream test.

    Four streams are chained through substreams: 'mystream' uses
    'neststream', which uses 'macstream2', which uses 'macstream'
    (a multiply-accumulate with a reduction length given by 'const').
    The thread runs 'macstream2' and 'mystream' against sequential models
    and prints a PASSED/FAILED verdict after each comparison.

    Returns:
        The constructed ``Module`` object ``m``.
    """
    m = Module('blinkled')
    clk = m.Input('CLK')
    rst = m.Input('RST')

    datawidth = 32
    addrwidth = 10
    # Number of products accumulated per reduction.
    reduce_size = 4

    myaxi = vthread.AXIM(m, 'myaxi', clk, rst, datawidth)
    ram_a = vthread.RAM(m, 'ram_a', clk, rst, datawidth, addrwidth)
    ram_b = vthread.RAM(m, 'ram_b', clk, rst, datawidth, addrwidth)
    ram_c = vthread.RAM(m, 'ram_c', clk, rst, datawidth, addrwidth)
    ram_d = vthread.RAM(m, 'ram_d', clk, rst, datawidth, addrwidth)

    # Level 0: accumulate a * b; expose the sum 'c' and the valid flag 'v'.
    macstrm = vthread.Stream(m, 'macstream', clk, rst)
    macstrm_a = macstrm.source('a')
    macstrm_b = macstrm.source('b')
    macstrm_const = macstrm.parameter('const')
    macstrm_mul = macstrm_a * macstrm_b
    macstrm_c, macstrm_v = macstrm.ReduceAddValid(macstrm_mul, macstrm_const)
    macstrm.sink(macstrm_c, 'c')
    macstrm.sink(macstrm_v, 'v')

    # Level 1: wraps 'macstream'.  The +1 / -1 / *1 operations leave the
    # values arithmetically unchanged.
    # NOTE(review): presumably they exist to add operators in front of the
    # substream inputs -- confirm.
    macstrm2 = vthread.Stream(m, 'macstream2', clk, rst)
    macstrm2_a = macstrm2.source('a')
    macstrm2_b = macstrm2.source('b')
    macstrm2_const = macstrm2.parameter('const')
    macstrm2_a = macstrm2_a + 1
    macstrm2_a = macstrm2_a - 1
    macstrm2_b = macstrm2_b * 1
    macsub = macstrm2.substream(macstrm)
    macsub.to_source('a', macstrm2_a)
    macsub.to_source('b', macstrm2_b)
    macsub.to_parameter('const', macstrm2_const)
    macstrm2_c = macsub.from_sink('c')
    macstrm2_v = macsub.from_sink('v')
    macstrm2.sink(macstrm2_c, 'c')
    macstrm2.sink(macstrm2_v, 'v')

    # Level 2: wraps 'macstream2'.  Both inputs are incremented by one
    # before the substream, and the incremented `a` is added to its result.
    neststrm = vthread.Stream(m, 'neststream', clk, rst)
    neststrm_a = neststrm.source('a')
    neststrm_b = neststrm.source('b')
    neststrm_const = neststrm.parameter('const')
    neststrm_a += 1
    neststrm_a += 0
    neststrm_b += 1
    macsub = neststrm.substream(macstrm2)
    macsub.to_source('a', neststrm_a)
    macsub.to_source('b', neststrm_b)
    macsub.to_parameter('const', neststrm_const)
    neststrm_c = macsub.from_sink('c')
    neststrm_c += neststrm_a
    neststrm_c += 0
    neststrm_v = macsub.from_sink('v')
    neststrm.sink(neststrm_c, 'c')
    neststrm.sink(neststrm_v, 'v')

    # Level 3: wraps 'neststream', adds y to its result and emits the value
    # only when the substream's 'v' is set.
    strm = vthread.Stream(m, 'mystream', clk, rst)
    x = strm.source('x')
    y = strm.source('y')
    const = strm.parameter('const')
    sub = strm.substream(neststrm)
    sub.to_source('a', x)
    sub.to_source('b', y)
    sub.to_parameter('const', const)
    z = sub.from_sink('c')
    v = sub.from_sink('v')
    z = z + y
    strm.sink(z, 'z', when=v, when_name='v')

    # Verdict flag kept in a module-level temporary register; it is set in
    # comp() and cleared in check() through `.value`.
    all_ok = m.TmpReg(initval=0)

    # Run 'macstream2' on its own: sums go to ram_c, valid flags to ram_d.
    def comp_stream_macstrm(size, offset):
        macstrm2.set_source('a', ram_a, offset, size)
        macstrm2.set_source('b', ram_b, offset, size)
        macstrm2.set_parameter('const', reduce_size)
        macstrm2.set_sink('c', ram_c, offset, size)
        macstrm2.set_sink('v', ram_d, offset, size)
        macstrm2.run()
        macstrm2.join()

    # Run 'mystream': one output per reduce_size inputs is written to ram_c.
    def comp_stream_mystrm(size, offset):
        strm.set_source('x', ram_a, offset, size)
        strm.set_source('y', ram_b, offset, size)
        strm.set_parameter('const', reduce_size)
        strm.set_sink('z', ram_c, offset, size // reduce_size)
        strm.run()
        strm.join()

    # Sequential model of the multiply-accumulate: a running sum that
    # restarts after every reduce_size products.  Only ram_c is compared by
    # check(); the ram_d flag values are written but not verified here.
    def comp_sequential_macstrm(size, offset):
        sum = 0
        count = 0
        for i in range(size):
            a = ram_a.read(i + offset)
            b = ram_b.read(i + offset)
            sum += a * b
            count += 1
            ram_c.write(i + offset, sum)
            ram_d.write(i + offset, count == (reduce_size - 1))
            if count == reduce_size:
                sum = 0
                count = 0

    # Sequential model of 'mystream': mirrors the +1 on both inputs, the
    # added (x + 1) from 'neststream' and the added y from 'mystream'.
    def comp_sequential_mystrm(size, offset):
        sum = 0
        count = 0
        write_offset = offset
        for i in range(size):
            x = ram_a.read(i + offset)
            y = ram_b.read(i + offset)
            sum += (x + 1) * (y + 1)
            val = sum + (x + 1) + y
            count += 1
            if count == reduce_size:
                ram_c.write(write_offset, val)
                write_offset += 1
                sum = 0
                count = 0

    # Compare `size` words of ram_c starting at the two offsets.  The flag
    # is never set back to True here, so a failure in an earlier call also
    # fails every later verdict.
    def check(size, offset_stream, offset_seq):
        for i in range(size):
            st = ram_c.read(i + offset_stream)
            sq = ram_c.read(i + offset_seq)
            if vthread.verilog.NotEql(st, sq):
                all_ok.value = False
                print(i, st, sq)
        if all_ok:
            print('# verify: PASSED')
        else:
            print('# verify: FAILED')

    # Thread body: first 'macstream2' vs its model, then 'mystream' vs its
    # model.  Stream results live at RAM offset 0, model results at `size`.
    def comp(size):
        all_ok.value = True

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_macstrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size)

        # verification
        print('# macstream')
        check(size, 0, offset)

        # stream
        offset = 0
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_stream_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024, size // reduce_size)

        # sequential
        offset = size
        myaxi.dma_read(ram_a, offset, 0, size)
        myaxi.dma_read(ram_b, offset, 0, size)
        comp_sequential_mystrm(size, offset)
        myaxi.dma_write(ram_c, offset, 1024 * 2, size // reduce_size)

        # verification
        print('# mystream')
        check(size // reduce_size, 0, offset)

    # Register `comp` as the thread body and start it with size = 16.
    th = vthread.Thread(m, 'th_comp', clk, rst, comp)
    fsm = th.start(16)

    return m