def instantiate(self, arch_input_chn, psum_chn, arr_x, arr_y, chn_per_word):
    """Record PE-array geometry, I/O channels, and per-pass state."""
    # Immutable PE-array configuration.
    self.arr_x, self.arr_y = arr_x, arr_y
    self.chn_per_word = chn_per_word
    # Channels this serializer drives.
    self.arch_input_chn = arch_input_chn
    self.psum_chn = psum_chn
    # Layer data; not yet bound.
    self.ifmap = None
    self.weights = None
    self.bias = None
    self.image_size = (0, 0)
    self.filter_size = (0, 0)
    # Pass-progress flags.
    self.ifmap_psum_done = True
    self.pass_done = Reg(False)
    # State counters.
    self.curr_set = 0
    self.curr_filter = 0
    self.iteration = 0
    self.fmap_idx = 0
def instantiate(self, depth=2, name=None):
    """Initialise FIFO storage and read/write pointers."""
    self.depth = depth
    self.name = name
    # Backing storage, one slot per entry; None marks an unused slot.
    self.data = [None] * depth
    # Pointer state held in Reg objects.
    self.rd_ptr = Reg(0)
    self.wr_ptr = Reg(0)
def instantiate(self, depth, peek_window, enq_window, deq_window):
    """Set up windowed-FIFO storage and per-operation window sizes."""
    self.depth = depth
    # Circular-buffer backing storage.
    self.data = [0] * depth
    # Number of elements moved per peek / enq / deq operation.
    self.peek_window = peek_window
    self.enq_window = enq_window
    self.deq_window = deq_window
    # Read/write pointer state.
    self.rd_ptr = Reg(0)
    self.wr_ptr = Reg(0)
def instantiate(self, locx, locy, bias_chn, ofmap_in_chn, ofmap_out_chn):
    """Record array coordinates, ofmap/bias channels, and zeroed stats."""
    self.locx, self.locy = locx, locy
    self.bias_chn = bias_chn
    self.ofmap_in_chn = ofmap_in_chn
    self.ofmap_out_chn = ofmap_out_chn
    # Latched completion flag for the current transform.
    self.transform_done = Reg(False)
    # Accounting for ALU operations and register-file traffic.
    self.stat_type = 'aggregate'
    self.raw_stats = {
        'post_tr_alu_comp': 0,
        'post_tr_rf_rd': 0,
        'post_tr_ifmap_rf_wr': 0,
    }
def instantiate(self, locx, locy, weight_in_chn, weight_out_chn):
    """Record array coordinates, weight channels, and zeroed stats."""
    self.locx, self.locy = locx, locy
    self.weight_in_chn = weight_in_chn
    self.weight_out_chn = weight_out_chn
    # Latched completion flag for the current transform.
    self.transform_done = Reg(False)
    # Accounting for ALU operations and register-file traffic.
    self.stat_type = 'aggregate'
    self.raw_stats = {
        'pre_tr_weights_alu_comp': 0,
        'pre_tr_weights_rf_rd': 0,
        'pre_tr_weights_rf_wr': 0,
    }
def instantiate(self, locx, locy, ifmap_in_chn, ifmap_out_chn):
    """Record array coordinates, ifmap channels, and zeroed stats."""
    self.locx, self.locy = locx, locy
    self.ifmap_in_chn = ifmap_in_chn
    self.ifmap_out_chn = ifmap_out_chn
    # Latched completion flag for the current transform.
    self.transform_done = Reg(False)
    # Accounting for ALU operations and register-file traffic.
    self.stat_type = 'aggregate'
    self.raw_stats = {
        'pre_tr_ifmap_alu_comp': 0,
        'pre_tr_ifmap_rf_rd': 0,
        'pre_tr_ifmap_rf_wr': 0,
    }
def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word):
    """Store deserializer geometry and clear per-pass bookkeeping."""
    # Immutable PE-array geometry.
    self.arr_x, self.arr_y = arr_x, arr_y
    self.chn_per_word = chn_per_word
    self.arch_output_chn = arch_output_chn
    # Output buffers; not yet bound.
    self.ofmap = None
    self.reference = None
    self.image_size = (0, 0)
    # Write-progress counters.
    self.curr_set = 0
    self.fmap_idx = 0
    self.pass_done = Reg(False)
def instantiate(self, arch_output_chn, arr_y, block_size, num_nonzero):
    """Store deserializer configuration and clear per-pass bookkeeping."""
    # Immutable geometry and sparsity parameters.
    self.arr_y = arr_y
    self.block_size = block_size
    self.num_nonzero = num_nonzero
    self.arch_output_chn = arch_output_chn
    # Output buffers; not yet bound.
    self.ofmap = None
    self.reference = None
    self.image_size = (0, 0)
    # Write-progress counters.
    self.curr_set = 0
    self.fmap_idx = 0
    self.pass_done = Reg(False)
class RegTB(Module):
    """Tiny testbench exercising two Reg instances.

    Each tick prints both register values, then schedules ra to
    increment and rb to accumulate.
    """

    def instantiate(self):
        self.ra = Reg(0)   # counter register
        self.rb = Reg(10)  # accumulator register, seeded at 10

    def tick(self):
        print("ra val: %d" % self.ra.rd())
        print("rb val: %d" % self.rb.rd())
        # NOTE(review): rb's update reads ra again *after* ra.wr(); whether
        # it observes the old or the new value depends on Reg's write
        # semantics — statement order preserved deliberately.
        self.ra.wr(self.ra.rd() + 1)
        self.rb.wr(self.ra.rd() + self.rb.rd())
class FIFOTB(Module):
    """Testbench driving a 4-deep FIFO: two enqueues then one dequeue
    in every group of four cycles."""

    def instantiate(self, check_fifo=True):
        # When False, the full/empty guards below are bypassed.
        self.check_fifo = check_fifo
        self.counter = Reg(0)
        self.fifo = FIFO(4)

    def tick(self):
        cycle = self.counter.rd()
        self.counter.wr((cycle + 1) % 256)
        # Cycles 0-1 of each group of four: enqueue the cycle number.
        if cycle % 4 < 2 and (self.fifo.not_full() or not self.check_fifo):
            self.fifo.enq(cycle)
            print("enq: %d" % cycle)
        # Cycle 3: peek and pop the head.
        if cycle % 4 == 3 and (self.fifo.not_empty() or not self.check_fifo):
            head = self.fifo.peek()
            self.fifo.deq()
            print("deq: %d" % head)
def instantiate(self, arch_input_chn, arr_y, block_size, num_nonzero,
                pruner_name):
    """Wire up the convert->prune input pipeline and reset serializer state."""
    # Immutable PE configuration.
    self.arr_y = arr_y
    self.block_size = block_size
    self.num_nonzero = num_nonzero
    # Internal pipeline channels: serializer -> converter -> pruner.
    self.convert_chn = Channel()
    self.prune_chn = Channel()
    self.arch_input_chn = arch_input_chn
    # Both this serializer and the pruner push to arch_input_chn; ordering
    # is safe because all weights are pushed by the serializer first, then
    # all inputs flow through the pruner.
    self.converter = Converter(self.convert_chn, self.prune_chn,
                               self.block_size, self.block_size)
    # Layer-specific pruner class, looked up by name in the pruner module.
    self.pruner = getattr(pruner, pruner_name)(self.prune_chn,
                                               self.arch_input_chn,
                                               self.num_nonzero,
                                               self.block_size, True)
    # Layer data; not yet bound.
    self.ifmap = None
    self.weights = None
    self.bias = None
    self.image_size = (0, 0)
    self.filter_size = (0, 0)
    # Pass-progress flags.
    self.ifmap_psum_done = True
    self.pass_done = Reg(False)
    # State counters.
    self.curr_set = 0
    self.curr_filter = 0
    self.iteration = 0
    self.fmap_idx = 0
    self.curr_chn = 0
    # Walk the first two (spatial) dimensions of the input.
    self.curr_x = 0
    self.curr_y = 0
    self.bias_set = 0
class InputSerializer(Module):
    """Streams a layer's data into the PE array over arch_input_chn.

    Per pass it first interleaves ifmap pixels and bias words (one channel
    set per tick), then streams the filter weights one channel set per
    tick, column by column, for every 2-D filter tap.
    """

    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        self.arch_input_chn = arch_input_chn
        # Layer data; bound in configure().
        self.ifmap = None
        self.weights = None
        self.bias = None
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        # Pass-progress flags.
        self.ifmap_psum_done = True
        self.pass_done = Reg(False)
        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0

    def configure(self, ifmap, weights, bias, image_size, filter_size):
        """Bind layer data and restart the serialization pass."""
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias
        self.image_size = image_size
        self.filter_size = filter_size
        self.ifmap_psum_done = False
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        # Channel sets per pixel: in_sets for the ifmap, out_sets for bias.
        in_sets = self.arr_y // self.chn_per_word
        out_sets = self.arr_x // self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.ifmap_psum_done:
            # Phase 1: one ifmap or bias word per tick, pixel by pixel.
            if self.arch_input_chn.vacancy():
                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]

                if self.curr_set < in_sets:
                    cmin = self.curr_set * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    # Write ifmap to glb
                    data = np.array(
                        [self.ifmap[x, y, c] for c in range(cmin, cmax)])
                else:
                    # Sets beyond in_sets carry the bias instead.
                    cmin = (self.curr_set - in_sets) * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    # Write bias to glb
                    data = np.array(
                        [self.bias[c] for c in range(cmin, cmax)])
                self.arch_input_chn.push(data)

                self.curr_set += 1
                if self.curr_set == (in_sets + out_sets):
                    self.curr_set = 0
                    self.fmap_idx += 1
                if self.fmap_idx == fmap_per_iteration:
                    self.fmap_idx = 0
                    self.ifmap_psum_done = True
        else:
            # Phase 2: stream weights, tap (f_x, f_y) at a time.
            f_x = self.iteration % self.filter_size[0]
            f_y = self.iteration // self.filter_size[0]

            # Push filters to PE columns. (PE is responsible for pop)
            if self.arch_input_chn.vacancy() and \
                    self.iteration < num_iteration:
                cmin = self.curr_set * self.chn_per_word
                cmax = cmin + self.chn_per_word
                data = np.array(
                    [self.weights[f_x, f_y, c, self.curr_filter]
                     for c in range(cmin, cmax)])
                self.arch_input_chn.push(data)

                self.curr_set += 1
                if self.curr_set == in_sets:
                    self.curr_set = 0
                    self.curr_filter += 1
                    if self.curr_filter == self.arr_x:
                        self.curr_filter = 0
                        self.iteration += 1
                        if self.iteration == num_iteration:
                            # All taps for all filters written.
                            self.pass_done.wr(True)
class WindowFIFO(Module):
    """FIFO supporting multi-element (windowed) peek, enqueue and dequeue.

    Pointers count modulo 2*depth so a full queue (occupancy == depth)
    is distinguishable from an empty one.
    """

    def instantiate(self, depth, peek_window, enq_window, deq_window):
        self.data = [0] * depth
        self.depth = depth
        # Elements transferred per peek / enq / deq.
        self.peek_window = peek_window
        self.enq_window = enq_window
        self.deq_window = deq_window
        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self):
        """Return the first peek_window entries without consuming them."""
        if (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth) \
                < self.peek_window:
            raise FIFOError("Reading from empty FIFO")
        peek_output = [0] * self.peek_window
        # BUG FIX: xrange is Python 2 only; the rest of this file targets
        # Python 3 (print() calls), so xrange raised NameError at runtime.
        for i in range(self.peek_window):
            peek_output[i] = self.data[(self.rd_ptr.rd() + i) % self.depth]
        return peek_output

    def enq(self, x):
        """Write enq_window elements of x; raises FIFOError on overflow."""
        if (self.wr_ptr.rd() - self.rd_ptr.rd() + self.enq_window - 1) % \
                (2 * self.depth) >= self.depth:
            raise FIFOError("Enqueueing into full FIFO")
        for i in range(self.enq_window):  # BUG FIX: was xrange (Python 2)
            self.data[(self.wr_ptr.rd() + i) % self.depth] = x[i]
        self.wr_ptr.wr((self.wr_ptr.rd() + self.enq_window) %
                       (2 * self.depth))

    def deq(self):
        """Drop deq_window elements; raises FIFOError on underflow."""
        if (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth) \
                < self.deq_window:
            raise FIFOError("Dequeueing from empty FIFO")
        self.rd_ptr.wr((self.rd_ptr.rd() + self.deq_window) %
                       (2 * self.depth))

    def not_full(self):
        # True when a full enq_window can be accepted.
        return not (
            (self.wr_ptr.rd() - self.rd_ptr.rd() + self.enq_window - 1) %
            (2 * self.depth) >= self.depth)

    def valid(self):
        # True when a full peek_window is available.
        return not ((self.wr_ptr.rd() - self.rd_ptr.rd()) %
                    (2 * self.depth) < self.peek_window)

    def not_empty(self):
        # True when a full deq_window is available.
        return not ((self.wr_ptr.rd() - self.rd_ptr.rd()) %
                    (2 * self.depth) < self.deq_window)

    def clear(self):
        # Drain by snapping the read pointer onto the write pointer.
        self.rd_ptr.wr(self.wr_ptr.rd())
class OutputDeserializer(Module):
    """Collects ofmap words from the array and validates the result.

    The ofmap has *valid-convolution* dimensions, (image - filter + 1)
    per axis; once every pixel of every channel set has been written the
    result is compared against the golden reference and the simulation
    is finished via Finish.
    """

    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        self.arch_output_chn = arch_output_chn
        # Buffers bound in configure().
        self.ofmap = None
        self.reference = None
        self.image_size = (0, 0)
        # Write-progress counters.
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, filter_size):
        """Bind the output buffer, golden reference, and layer geometry."""
        self.ofmap = ofmap
        self.reference = reference
        self.image_size = image_size
        self.filter_size = filter_size
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.arr_x // self.chn_per_word
        # Valid-convolution output size (no padding).
        out_w = self.image_size[0] - self.filter_size[0] + 1
        out_h = self.image_size[1] - self.filter_size[1] + 1
        fmap_per_iteration = out_w * out_h

        if self.arch_output_chn.valid():
            data = [e for e in self.arch_output_chn.pop()]

            # FIX: removed dead x/y computed from the full image size that
            # were immediately overwritten; only the valid-output-size
            # coordinates are meaningful here.
            x = self.fmap_idx % out_w
            y = self.fmap_idx // out_w

            if self.curr_set < out_sets:
                cmin = self.curr_set * self.chn_per_word
                cmax = cmin + self.chn_per_word
                for c in range(cmin, cmax):
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1

            # Advance to the next pixel once every channel set has landed.
            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                self.pass_done.wr(True)
                # End of pass: validate against the reference ofmap.
                if np.all(self.ofmap == self.reference):
                    raise Finish("Success")
                else:
                    print(self.ofmap)
                    print(self.reference)
                    print(self.ofmap - self.reference)
                    raise Finish("Validation Failed")
class FIFO(Module):
    """Synchronous FIFO whose pointers count modulo 2*depth.

    The extra pointer bit lets a full queue (occupancy == depth) be told
    apart from an empty one (occupancy == 0).
    """

    def instantiate(self, depth=2):
        self.depth = depth
        self.data = [0] * depth
        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def _count(self):
        # Current occupancy, from the wrapped pointer difference.
        return (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth)

    def peek(self):
        """Return the head entry without consuming it."""
        if self._count() == 0:
            raise FIFOError("Reading from empty FIFO")
        return self.data[self.rd_ptr.rd() % self.depth]

    def enq(self, x):
        """Append x at the tail; raises FIFOError when full."""
        if self._count() == self.depth:
            raise FIFOError("Enqueueing into full FIFO")
        tail = self.wr_ptr.rd()
        self.data[tail % self.depth] = x
        self.wr_ptr.wr((tail + 1) % (2 * self.depth))

    def deq(self):
        """Drop the head entry; raises FIFOError when empty."""
        if self._count() == 0:
            raise FIFOError("Dequeueing from empty FIFO")
        self.rd_ptr.wr((self.rd_ptr.rd() + 1) % (2 * self.depth))

    def not_full(self):
        return self._count() != self.depth

    def not_empty(self):
        return self._count() != 0

    def reset(self):
        # Drain by snapping the read pointer onto the write pointer.
        self.rd_ptr.wr(self.wr_ptr.rd())
class PreTransformIFMap(Module):
    """Winograd input transform for one 4x4 ifmap tile.

    Streams in the 16 values of D (row-major D00..D33), one per tick, and
    accumulates V = B_T * D * B element-by-element.  From iteration 10
    onward, V entries that have become final start streaming out; once all
    of D has arrived (iteration 16) the remaining entries are flushed one
    per tick.
    """

    def instantiate(self, locx, locy, ifmap_in_chn, ifmap_out_chn):
        # Position of this unit in the array.
        self.locx = locx
        self.locy = locy
        self.ifmap_in_chn = ifmap_in_chn
        self.ifmap_out_chn = ifmap_out_chn
        self.transform_done = Reg(False)
        # Accounting: ALU operations and register-file reads/writes.
        self.stat_type = 'aggregate'
        self.raw_stats = {
            'pre_tr_ifmap_alu_comp': 0,
            'pre_tr_ifmap_rf_rd': 0,
            'pre_tr_ifmap_rf_wr': 0
        }

    def configure(self):
        """Reset streaming state and zero the 4x4 accumulator V."""
        self.iteration = 0  # index of the next D value to consume (0..15)
        self.push_ctr = 0   # index of the next V value to emit (0..15)
        self.V = np.zeros([4, 4]).astype(np.int64)
        self.raw_stats['pre_tr_ifmap_rf_wr'] += 16  # write zeros into rf
        self.transform_done.wr(False)

    # Explanation of algorithm: transform ifmap D into V, performing the
    # Winograd transform V = B_T * D * B
    #
    # B_T = [1  0 -1  0        B = [ 1  0  0  0
    #        0  1  1  0              0  1 -1  1
    #        0 -1  1  0             -1  1  1  0
    #        0  1  0 -1]             0  0  0 -1]
    #
    # so each output element is a signed sum of D entries:
    # v00 = D00 - D02 - D20 + D22;   v01 = D01 + D02 - D21 - D22;
    # v02 = D02 - D01 + D21 - D22;   v03 = D01 - D03 - D21 + D23;
    # v10 = D10 - D12 + D20 - D22;   v11 = D11 + D12 + D21 + D22;
    # v12 = D12 - D11 - D21 + D22;   v13 = D11 - D13 + D21 - D23;
    # v20 = D12 - D10 + D20 - D22;   v21 = D21 - D12 - D11 + D22;
    # v22 = D11 - D12 - D21 + D22;   v23 = D13 - D11 + D21 - D23;
    # v30 = D10 - D12 - D30 + D32;   v31 = D11 + D12 - D31 - D32;
    # v32 = D12 - D11 + D31 - D32;   v33 = D11 - D13 - D31 + D33;

    def _emit_next(self):
        # Push the next finalized V entry (row-major order) downstream.
        self.ifmap_out_chn.push(self.V[self.push_ctr // 4][self.push_ctr % 4])
        self.push_ctr += 1

    def tick(self):
        if self.transform_done.rd():
            return
        if self.ifmap_in_chn.valid() and self.ifmap_out_chn.vacancy():
            d = self.ifmap_in_chn.pop()
            if (self.iteration == 0):  # get D_00
                self.V[0][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get D_01
                self.V[0][1] += d
                self.V[0][2] -= d
                self.V[0][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 2):  # get D_02
                self.V[0][0] -= d
                self.V[0][1] += d
                self.V[0][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 3):  # get D_03
                self.V[0][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get D_10
                self.V[1][0] += d
                self.V[2][0] -= d
                self.V[3][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 5):  # get D_11
                self.V[1][1] += d
                self.V[1][2] -= d
                self.V[1][3] += d
                self.V[2][1] -= d
                self.V[2][2] += d
                self.V[2][3] -= d
                self.V[3][1] += d
                self.V[3][2] -= d
                self.V[3][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 6):  # get D_12
                self.V[1][0] -= d
                self.V[1][1] += d
                self.V[1][2] += d
                self.V[2][0] += d
                self.V[2][1] -= d
                self.V[2][2] -= d
                self.V[3][0] -= d
                self.V[3][1] += d
                self.V[3][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 7):  # get D_13
                self.V[1][3] -= d
                self.V[2][3] += d
                self.V[3][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 8):  # get D_20
                self.V[0][0] -= d
                self.V[1][0] += d
                self.V[2][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 9):  # get D_21
                self.V[0][1] -= d
                self.V[0][2] += d
                self.V[0][3] -= d
                self.V[1][1] += d
                self.V[1][2] -= d
                self.V[1][3] += d
                self.V[2][1] += d
                self.V[2][2] -= d
                self.V[2][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 10):
                # get D_22 & start pushing transformed data out
                self.V[0][0] += d
                self.V[0][1] -= d
                self.V[0][2] -= d
                self.V[1][0] -= d
                self.V[1][1] += d
                self.V[1][2] += d
                self.V[2][0] -= d
                self.V[2][1] += d
                self.V[2][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self._emit_next()  # push v00
                # BUG FIX: v00 is forwarded directly without being written
                # back to the rf, so cancel one *write* (the original
                # cancelled a read) — matches PreTransformWeights, which
                # does rf_wr -= 1 in the identical "sent immediately"
                # situation.
                self.raw_stats['pre_tr_ifmap_rf_wr'] -= 1
                self.iteration += 1
            elif (self.iteration == 11):  # get D_23
                self.V[0][3] += d
                self.V[1][3] -= d
                self.V[2][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self._emit_next()  # push v01
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v01 from rf
                self.iteration += 1
            elif (self.iteration == 12):  # get D_30
                self.V[3][0] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self._emit_next()  # push v02
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v02 from rf
                self.iteration += 1
            elif (self.iteration == 13):  # get D_31
                self.V[3][1] -= d
                self.V[3][2] += d
                self.V[3][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self._emit_next()  # push v03
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v03 from rf
                self.iteration += 1
            elif (self.iteration == 14):  # get D_32
                self.V[3][0] += d
                self.V[3][1] -= d
                self.V[3][2] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self._emit_next()  # push v10
                # BUG FIX: emitting v10 is a register-file *read* (as at
                # iterations 11-13); the original incremented rf_wr here.
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v10 from rf
                self.iteration += 1
            elif (self.iteration == 15):  # get D_33
                self.V[3][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self._emit_next()  # push v11
                # BUG FIX: same as iteration 14 — this is a read, not a
                # write.
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v11 from rf
                self.iteration += 1
        elif self.iteration == 16 and self.ifmap_out_chn.vacancy():
            # Done computing the transform: flush the remaining V entries
            # sequentially, one per tick.
            self._emit_next()
            self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read vXX from rf
            if self.push_ctr == 16:
                # all 16 transformed ifmap values have been pushed
                self.transform_done.wr(True)
class OutputDeserializer(Module):
    """Receives psum packets from the array, scatters them into the ofmap,
    and checks the finished ofmap against a golden reference."""

    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word):
        # Immutable PE-array geometry.
        self.arr_x, self.arr_y = arr_x, arr_y
        self.chn_per_word = chn_per_word
        self.arch_output_chn = arch_output_chn
        # Buffers bound in configure().
        self.ofmap = None
        self.reference = None
        self.image_size = (0, 0)
        # Write-progress counters.
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size):
        """Bind the output buffer and reference for a new pass."""
        self.ofmap = ofmap
        self.reference = reference
        self.image_size = image_size
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        # Packets expected per pixel, and pixels per pass.
        sets_per_pixel = self.arr_x // self.chn_per_word
        pixels_total = self.image_size[0] * self.image_size[1]

        if not self.arch_output_chn.valid():
            return
        packet = [e for e in self.arch_output_chn.pop()]

        # Raster-order coordinates of the pixel currently being filled.
        col = self.fmap_idx % self.image_size[0]
        row = self.fmap_idx // self.image_size[0]

        if self.curr_set < sets_per_pixel:
            base = self.curr_set * self.chn_per_word
            for off in range(self.chn_per_word):
                self.ofmap[col, row, base + off] = packet[off]
        self.curr_set += 1

        # After receiving every channel set for this pixel, move to the next.
        if self.curr_set == sets_per_pixel:
            self.curr_set = 0
            self.fmap_idx += 1
        if self.fmap_idx == pixels_total:
            self.fmap_idx = 0
            self.pass_done.wr(True)
            if np.all(self.ofmap == self.reference):
                raise Finish("Success")
            else:
                print(self.ofmap)
                print(self.reference)
                print(self.ofmap - self.reference)
                raise Finish("Validation Failed")
class OutputDeserializer(Module):
    """Collects Winograd 2x2 output tiles into a full ofmap and validates it.

    Packets arrive per tile (four 2x2 tiles per 4x4 region); each is
    scattered into self.ofmap by tile index and in-tile position.
    NOTE(review): the inverse Winograd transform / bias-add code below is
    commented out, so validation compares the *untransformed* ofmap —
    confirm this is intended.
    """

    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word,
                    finish_signal_chn):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        self.arch_output_chn = arch_output_chn
        self.finish_signal_chn = finish_signal_chn
        # Buffers bound in configure().
        self.ofmap = None
        self.ofmap_transformed = None
        self.reference = None
        self.image_size = (0, 0)
        # Write-progress counters.
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, bias):
        # Note: the passed-in ofmap is ignored; a fresh zeroed buffer of
        # shape (H, W, arr_x) is allocated instead.
        # self.ofmap = np.zeros((2, 2, self.arr_x, 4)).astype(np.int64)  # 2x2x8x4
        self.ofmap = np.zeros(
            (image_size[0], image_size[1], self.arr_x)).astype(np.int64)  # 4x4x8
        self.reference = reference
        self.num_tiles = 4
        self.curr_tile = 0
        self.image_size = image_size
        self.bias = bias  # TODO
        self.curr_set = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        # Inverse-transform matrices for Winograd F(2x2, 3x3).
        self.A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]])
        self.A = self.A_T.transpose()
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            # partly parallelized to be on chip:
            # x_idx = (self.curr_tile // 2)*2
            # y_idx = (self.curr_tile % 2)*2
            # self.ofmap_transformed[x_idx:x_idx+2, y_idx:y_idx+2, self.curr_chn] += \
            #     np.dot(self.A_T, np.dot(self.ofmap[:,:,self.curr_chn, self.curr_tile], self.A))
            # self.curr_tile += 1
            # if self.curr_tile == 4:
            #     self.curr_tile = 0
            #     self.ofmap_transformed[:,:,self.curr_chn] += self.bias[self.curr_chn]  # add bias
            #     self.curr_chn += 1
            #
            # FOR LOOPS USED B/C NOT COUNTING OFF CHIP PROCESSING IN
            # PERFORMANCE STATISTICS (will unroll loops in on chip processing)
            # for k in range(8):
            #     self.ofmap_transformed[:,:,k] += self.bias[k]  # add bias
            #     for t in range(self.num_tiles):
            #         x_idx = (t // 2)*2
            #         y_idx = (t % 2)*2
            #         self.ofmap_transformed[x_idx:x_idx+2, y_idx:y_idx+2, k] += \
            #             np.dot(self.A_T, np.dot(self.ofmap[:,:,k,t], self.A))
            # self.finish_signal_chn.push(True)
            if np.all(self.ofmap == self.reference):
                raise Finish("Success")
            else:
                print("ofmap: ")
                print(self.ofmap)
                print("reference: ")
                print(self.reference)
                print("difference: ")
                print(self.ofmap - self.reference)
                raise Finish("Validation Failed")
        else:
            #print ("output deser curr_tile, fmap_idx: ", self.curr_tile, self.fmap_idx)
            out_sets = self.arr_x // self.chn_per_word  # 2
            fmap_per_iteration = 4  # ofmap size, parametrize .. TODO
            if self.arch_output_chn.valid():
                data = [e for e in self.arch_output_chn.pop()]
                # Top-left corner of the current 2x2 tile in the ofmap.
                x_idx = (self.curr_tile // 2) * 2
                y_idx = (self.curr_tile % 2) * 2
                # Position inside the tile, selected by fmap_idx.
                x = (self.fmap_idx % 2) + x_idx
                y = self.fmap_idx // 2 + y_idx
                # self.ofmap_transformed[x_idx:x_idx+2, y_idx:y_idx+2, k] += \
                #     np.dot(self.A_T, np.dot(self.ofmap[:,:,k,t], self.A))
                if self.curr_set < out_sets:
                    cmin = self.curr_set * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    for c in range(cmin, cmax):
                        self.ofmap[x, y, c] = data[c - cmin]
                self.curr_set += 1
                if self.curr_set == out_sets:
                    self.curr_set = 0
                    #self.fmap_idx += 1
                    # Cycle through the four tiles before advancing the
                    # in-tile position.
                    self.curr_tile += 1
                    if self.curr_tile == 4:
                        self.fmap_idx += 1
                        self.curr_tile = 0
                    if self.fmap_idx == fmap_per_iteration:
                        self.fmap_idx = 0
                        self.curr_tile = 0
                        # self.ofmap = self.ofmap//(128*128)
                        self.pass_done.wr(True)
def instantiate(self, check_fifo=True):
    """Create the cycle counter and the 4-entry FIFO under test."""
    # Flag consulted before enq/deq; False bypasses the full/empty guards.
    self.check_fifo = check_fifo
    self.counter = Reg(0)
    self.fifo = FIFO(4)
class OutputDeserializer(Module):
    """Multi-pass output deserializer.

    Runs num_passes (4) passes. On even passes (0, 2) each received packet
    is also forwarded over psum_chn — presumably re-injected as partial
    sums by the serializer (confirm with the psum_chn consumer). Passes
    after the first pair write with a channel offset of 8. Validation
    against the reference runs after the final pass.
    """

    def instantiate(self, arch_output_chn, psum_chn, arr_x, arr_y,
                    chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        # Output stream from the array, plus loop-back channel for psums.
        self.arch_output_chn = arch_output_chn
        self.psum_chn = psum_chn
        # Buffers bound in configure().
        self.ofmap = None
        self.reference = None
        self.image_size = (0, 0)
        # Write-progress counters.
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, curr_pass):
        # Only bind the ofmap on the first pass so later passes keep
        # accumulating into the same buffer.
        if (curr_pass == 0):  # so that ofmap doesnt get rewritten with zeros
            self.ofmap = ofmap
        self.reference = reference
        self.image_size = image_size
        self.curr_set = 0
        self.fmap_idx = 0
        self.curr_pass = curr_pass
        self.num_passes = 4
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.arr_x // self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]

        # On even passes a packet is consumed only when it can also be
        # forwarded on psum_chn; odd passes need no forwarding.
        if self.arch_output_chn.valid() and (self.psum_chn.vacancy()
                                             or self.curr_pass % 2 == 1):
            data = [e for e in self.arch_output_chn.pop()]
            if ((self.curr_pass % 2) == 0):
                # push ofmap psum to serializer on pass 0 and 2
                self.psum_chn.push(data)

            x = self.fmap_idx % self.image_size[0]
            y = self.fmap_idx // self.image_size[0]

            if self.curr_set < out_sets:
                # Passes 2-3 target the upper half of the channel dim.
                channel_offset = 0
                if (self.curr_pass > 1):
                    channel_offset = 8
                cmin = self.curr_set * self.chn_per_word + channel_offset
                cmax = cmin + self.chn_per_word
                for c in range(cmin, cmax):
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1

            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                # Only the last pass finishes and validates the result.
                if (self.curr_pass == self.num_passes - 1):
                    self.pass_done.wr(True)
                    if np.all(self.ofmap == self.reference):
                        raise Finish("Success")
                    else:
                        print(self.ofmap)
                        print(self.reference)
                        print(self.ofmap - self.reference)
                        raise Finish("Validation Failed")
def instantiate(self):
    """Create the two registers exercised by the testbench."""
    self.ra = Reg(0)   # incrementing counter
    self.rb = Reg(10)  # running accumulator, seeded with 10
class InputSerializer(Module):
    """Serializes weights, biases, and ifmap activations into the array.

    Order per pass: (1) all filter weights, block by block; (2) bias
    blocks; (3) ifmap activations, which are routed through a Converter
    and a configurable Pruner before reaching arch_input_chn. Once the
    ifmap is exhausted, zero blocks are pushed to keep the convert
    pipeline fed.
    """

    def instantiate(self, arch_input_chn, arr_y, block_size, num_nonzero,
                    pruner_name):
        # PE static configuration (immutable)
        #self.arr_x = arr_x
        self.arr_y = arr_y
        #self.chn_per_word = chn_per_word
        self.block_size = block_size
        self.num_nonzero = num_nonzero
        # Internal pipeline channels: serializer -> converter -> pruner.
        self.convert_chn = Channel()
        self.prune_chn = Channel()
        self.arch_input_chn = arch_input_chn
        # Although both InputSerializer and pruner will be pushing to
        # arch_input_chn, there is no conflict issue because all weights
        # will be pushed by IS first, then all inputs by pruner.
        self.converter = Converter(self.convert_chn, self.prune_chn,
                                   self.block_size, self.block_size)
        # self.pruner = NaivePruner(self.prune_chn,self.arch_input_chn, \
        #        self.num_nonzero,True)
        # User-defined pruner for this layer, default to naive pruner.
        self.pruner = getattr(pruner, pruner_name)(self.prune_chn,
                                                   self.arch_input_chn,
                                                   self.num_nonzero,
                                                   self.block_size, True)
        # Layer data; bound in configure().
        self.ifmap = None
        self.weights = None
        self.bias = None
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        # Pass-progress flags.
        self.ifmap_psum_done = True
        self.pass_done = Reg(False)
        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def configure(self, ifmap, weights, bias, in_chn, out_chn, image_size,
                  filter_size):
        """Bind layer data and reset all progress state for a new pass."""
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias
        self.in_chn = in_chn
        self.out_chn = out_chn
        self.image_size = image_size
        self.filter_size = filter_size
        self.ifmap_psum_done = False
        self.weights_done = False
        self.pass_done.wr(False)
        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def tick(self):
        if self.pass_done.rd():
            return
        if self.ifmap_psum_done:
            # Ifmap fully sent: keep feeding zero blocks so the downstream
            # converter/pruner stages can drain.
            if self.convert_chn.vacancy():
                data = np.zeros(self.block_size)
                self.convert_chn.push(data)
            return

        in_sets = self.in_chn // self.block_size
        out_sets = self.out_chn // self.block_size
        num_iteration = self.filter_size[0] * self.filter_size[1]

        # read and hold all weights at the beginning for ease of
        # implementation
        if not self.weights_done:
            f_x = self.iteration // self.filter_size[0]
            f_y = self.iteration % self.filter_size[0]
            # Push filters to PE columns. (PE is responsible for pop)
            if self.arch_input_chn.vacancy() and \
                    self.iteration < num_iteration:
                cmin = self.curr_filter * self.block_size
                cmax = cmin + self.block_size
                data = np.array([self.weights[f_x, f_y, self.curr_chn, c]
                                 for c in range(cmin, cmax)])
                #print("{},{},{},{}-{}".format(f_x,f_y,self.curr_chn,cmin,cmax))
                #print(data)
                # Gives groups of block_size along the num_filters axis.
                self.arch_input_chn.push(data)
                self.curr_filter += 1
                if (self.curr_filter == out_sets):  # Loop through blocks of filters
                    self.curr_filter = 0
                    self.curr_chn += 1
                    if (self.curr_chn == self.in_chn):  # Loop through channels
                        self.curr_chn = 0
                        self.iteration += 1
                        if (self.iteration == num_iteration):  # Loop through 2D filter support
                            self.iteration = 0
                            #print("Weights done")
                            self.weights_done = True
        elif self.arch_input_chn.vacancy() and self.bias_set < out_sets:
            # One bias block per output set, sent after all weights.
            cmin = self.bias_set * self.block_size
            cmax = cmin + self.block_size
            data = np.array([self.bias[c] for c in range(cmin, cmax)])
            #print("bias (input serializer):")
            #print(data)
            self.arch_input_chn.push(data)
            self.bias_set += 1
        elif not self.ifmap_psum_done:
            # Stream ifmap blocks through the converter -> pruner path.
            if self.convert_chn.vacancy():
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size
                #xmin = x
                #xmax = x+self.arr_x
                # Write ifmap to glb
                #data = np.array([ self.ifmap[x, self.curr_y, self.curr_chn] for x in range(xmin, xmax) ])
                data = np.array([
                    self.ifmap[self.curr_x, self.curr_y, c]
                    for c in range(cmin, cmax)
                ])
                #print("{},{},{}-{}".format(self.curr_x, self.curr_y, cmin, cmax))
                #print(data)
                self.curr_set += 1
                if (self.curr_set == in_sets):
                    self.curr_set = 0
                    self.curr_y += 1
                    if (self.curr_y == self.image_size[1]):
                        self.curr_y = 0
                        self.curr_x += 1
                self.convert_chn.push(data)
                if (self.curr_x == self.image_size[0]):
                    self.curr_x = 0
                    self.ifmap_psum_done = True
class PreTransformWeights(Module):
    """Winograd weight pre-transform node at PE (locx, locy).

    Streams in the nine taps of a 3x3 filter G (row-major G00..G22), one
    value per tick from weight_in_chn, incrementally accumulates the 4x4
    transformed matrix U = H*G*H_T, and streams the sixteen U values out
    on weight_out_chn in row-major order.  Fixed-point scaling is applied
    with left shifts (multiplications by 32/64/128); raw_stats models ALU
    operations and register-file reads/writes.
    """

    def instantiate(self, locx, locy, weight_in_chn, weight_out_chn):
        self.locx = locx                      # PE column coordinate (debug only)
        self.locy = locy                      # PE row coordinate (debug only)
        self.weight_in_chn = weight_in_chn    # raw 3x3 filter taps in
        self.weight_out_chn = weight_out_chn  # transformed 4x4 U values out
        self.transform_done = Reg(False)      # latched once all 16 U values pushed
        self.stat_type = 'aggregate'
        self.raw_stats = {
            'pre_tr_weights_alu_comp': 0,
            'pre_tr_weights_rf_rd': 0,
            'pre_tr_weights_rf_wr': 0
        }

    def configure(self):
        """Reset per-pass state so a new filter can be transformed."""
        self.iteration = 0   # which input tap we are on (then drain states 9/10)
        self.push_ctr = 0    # how many of the 16 U values have been pushed
        self.U = np.zeros([4, 4]).astype(np.int64)
        self.transform_done.wr(False)

    # Explanation of algorithm: transform filter weights G into U,
    # performing Winograd transform U = H*G*H_T
    # G = [G00 G01 G02
    #      G10 G11 G12
    #      G20 G21 G22]
    #
    # H = [1    0    0
    #      0.5  0.5  0.5
    #      0.5 -0.5  0.5
    #      0    0    1 ]
    #
    # H_T = [1 0.5  0.5 0
    #        0 0.5 -0.5 0
    #        0 0.5  0.5 1]
    #
    # Performing this transform yields a 4x4 output for a given 3x3 input:
    #
    # U = [u00 u01 u02 u03
    #      u10 u11 u12 u13
    #      u20 u21 u22 u23
    #      u30 u31 u32 u33]
    # ... such that:
    # u00 = (G00)<<7;
    # u01 = (G00 + G01 + G02)<<6;
    # u02 = (G00 - G01 + G02)<<6;
    # u03 = (G02)<<7;
    # u10 = (G00 + G10 + G20)<<6;
    # u11 = (G00 + G01 + G02 + G10 + G11 + G12 + G20 + G21 + G22)<<5;
    # u12 = (G00 - G01 + G02 + G10 - G11 + G12 + G20 - G21 + G22)<<5;
    # u13 = (G02 + G12 + G22)<<6;
    # u20 = (G00 - G10 + G20)<<6;
    # u21 = (G00 + G01 + G02 - G10 - G11 - G12 + G20 + G21 + G22)<<5;
    # u22 = (G00 - G01 + G02 - G10 + G11 - G12 + G20 - G21 + G22)<<5;
    # u23 = (G02 - G12 + G22)<<6;
    # u30 = (G20)<<7;
    # u31 = (G20 + G21 + G22)<<6;
    # u32 = (G20 - G21 + G22)<<6;
    # u33 = (G22)<<7;

    def tick(self):
        if self.transform_done.rd():
            return
        # Iterations 0-8: one new tap g arrives per tick and is accumulated
        # into every U entry it contributes to.  A U entry is pushed as soon
        # as it is complete (row-major order via push_ctr).
        if self.weight_in_chn.valid() and self.weight_out_chn.vacancy():
            g = (self.weight_in_chn.pop())
            if (self.iteration == 0):  # get G_00
                self.U[0][0] += g
                self.U[0][1] += g
                self.U[0][2] += g
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[0][0] = self.U[0][0] * 128  # left shift by 7
                self.raw_stats['pre_tr_weights_alu_comp'] += 10  # 9 adds, 1 shift
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U00
                # u00 sent immediately, not written back to rf
                self.raw_stats['pre_tr_weights_rf_wr'] -= 1
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get G_01
                self.U[0][1] += g
                self.U[0][2] -= g
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] += g
                self.U[2][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                self.iteration += 1
            elif (self.iteration == 2):  # get G_02
                self.U[0][1] += g
                self.U[0][2] += g
                self.U[0][3] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[2][3] += g
                self.U[0][1] = self.U[0][1] * 64   # left shift by 6
                self.U[0][2] = self.U[0][2] * 64   # left shift by 6
                self.U[0][3] = self.U[0][3] * 128  # left shift by 7
                self.raw_stats['pre_tr_weights_alu_comp'] += 12  # 9 adds/subt, 3 shift
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U01
                # u01 sent immediately, not written back to rf
                self.raw_stats['pre_tr_weights_rf_wr'] -= 1
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 3):  # get G_10
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] -= g
                self.U[2][1] -= g
                self.U[2][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U02
                self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read u02 from rf
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get G_11
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] -= g
                self.U[2][2] += g
                self.raw_stats['pre_tr_weights_alu_comp'] += 4
                self.raw_stats['pre_tr_weights_rf_rd'] += 4
                self.raw_stats['pre_tr_weights_rf_wr'] += 4
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U03
                self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read u03 from rf
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 5):  # get G_12
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] -= g
                self.U[2][2] -= g
                self.U[2][3] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                # no new completed weights this round
                self.iteration += 1
            elif (self.iteration == 6):  # get G_20
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[3][0] += g
                self.U[3][1] += g
                self.U[3][2] += g
                self.U[1][0] = self.U[1][0] * 64   # left shift 6
                self.U[2][0] = self.U[2][0] * 64   # left shift 6
                self.U[3][0] = self.U[3][0] * 128  # left shift 7
                self.raw_stats['pre_tr_weights_alu_comp'] += 12  # 9 add, 3 shift ops
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U10
                # send u10 immediately, w/o writing to rf
                self.raw_stats['pre_tr_weights_rf_wr'] -= 1
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 7):  # get G_21
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] += g
                self.U[2][2] -= g
                self.U[3][1] += g
                self.U[3][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6  # 6 add/sub ops
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                # no new completed weights this round
                self.iteration += 1
            elif (self.iteration == 8):  # get G_22
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[2][3] += g
                self.U[3][1] += g
                self.U[3][2] += g
                self.U[3][3] += g
                self.raw_stats['pre_tr_weights_alu_comp'] += 9  # 9 add
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.iteration += 1
        # Drain phase: all nine taps have been consumed, so these states
        # run without popping from weight_in_chn.
        elif (self.iteration == 9 and self.weight_out_chn.vacancy()):
            # Apply the remaining fixed-point shifts, then push U11.
            self.U[1][1] = self.U[1][1] * 32  # left shift by 5
            self.U[1][2] = self.U[1][2] * 32  # left shift by 5
            self.U[1][3] = self.U[1][3] * 64
            self.U[2][1] = self.U[2][1] * 32
            self.U[2][2] = self.U[2][2] * 32
            self.U[2][3] = self.U[2][3] * 64
            self.U[3][1] = self.U[3][1] * 64
            self.U[3][2] = self.U[3][2] * 64
            self.U[3][3] = self.U[3][3] * 128
            self.raw_stats['pre_tr_weights_alu_comp'] += 9  # 9 shift ops
            self.raw_stats['pre_tr_weights_rf_rd'] += 9
            self.raw_stats['pre_tr_weights_rf_wr'] += 9
            self.weight_out_chn.push(
                self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U11
            # send u11 immediately w/o writing to rf
            self.raw_stats['pre_tr_weights_rf_wr'] -= 1
            self.push_ctr += 1
            self.iteration += 1
        elif self.iteration == 10 and self.weight_out_chn.vacancy():
            # Finish pushing the remaining transformed weights (U12..U33),
            # one per tick, read straight from the register file.
            self.weight_out_chn.push(
                self.U[self.push_ctr // 4][self.push_ctr % 4])
            self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read uXX from rf
            self.push_ctr += 1
            if self.push_ctr == 16:
                # all 16 transformed weight values have been pushed
                self.transform_done.wr(True)
class PostTransform(Module):
    """Winograd output (inverse) transform node at PE (locx, locy).

    Receives the sixteen values of the 4x4 element-wise product matrix M
    in row-major order on ofmap_in_chn (plus one bias value on bias_chn),
    accumulates the 2x2 output tile y = A_T*M*A on top of the bias, and
    pushes y00, y01, y10, y11 on ofmap_out_chn as each completes.
    """

    def instantiate(self, locx, locy, bias_chn, ofmap_in_chn, ofmap_out_chn):  #ofmap_in
        self.locx = locx                      # PE column coordinate (debug only)
        self.locy = locy                      # PE row coordinate (debug only)
        self.bias_chn = bias_chn              # per-filter bias, one value per pass
        self.ofmap_in_chn = ofmap_in_chn      # 4x4 M values in, row-major
        self.ofmap_out_chn = ofmap_out_chn    # 2x2 y values out
        self.transform_done = Reg(False)      # latched after y11 is pushed
        self.stat_type = 'aggregate'
        self.raw_stats = {'post_tr_alu_comp' : 0,
                          'post_tr_rf_rd' : 0,
                          'post_tr_ifmap_rf_wr' : 0}

    def configure(self):
        """Reset per-pass state before a new 4x4 tile arrives."""
        self.bias = 0
        self.iteration = 0
        # NOTE(review): the accumulators start as None and only become
        # numeric in the bias branch of tick(); if an M value ever arrived
        # before the bias, `self.y00 += m` would raise TypeError — confirm
        # the bias is always delivered first.
        self.y00 = None
        self.y01 = None
        self.y10 = None
        self.y11 = None
        self.transform_done.wr(False)
        self.bias_read = False

    # Explanation of algorithm: transform ofmap M into y, performing
    # inverse Winograd transform y = A_T*M*A
    # M = [M00 M01 M02 M03
    #      M10 M11 M12 M13
    #      M20 M21 M22 M23
    #      M30 M31 M32 M33]
    #
    # A_T = [1 1  1  0
    #        0 1 -1 -1]
    #
    # A = [1  0
    #      1  1
    #      1 -1
    #      0 -1]
    #
    # Performing this transform yields a 2x2 output for a given 4x4 input:
    #
    # y = [y00 y01
    #      y10 y11]
    # ... such that:
    # y00 = M00+M01+M02+M10+M11+M12+M20+M21+M22
    # y01 = M01-M02-M03+M11-M12-M13+M21-M22-M23
    # y10 = M10+M11+M12-M20-M21-M22-M30-M31-M32
    # y11 = M11-M12-M13-M21+M22+M23-M31+M32+M33

    def tick(self):
        if self.transform_done.rd():
            return
        if self.bias_chn.valid():  # should only ever be valid once
            # Seed all four accumulators with the bias so the final
            # outputs are y = A_T*M*A + bias.
            self.bias = self.bias_chn.pop()
            self.bias_read = True
            self.y00 = self.bias
            self.y01 = self.bias
            self.y10 = self.bias
            self.y11 = self.bias
            self.raw_stats['post_tr_alu_comp'] += 4
            self.raw_stats['post_tr_ifmap_rf_wr'] += 4
        elif self.ofmap_in_chn.valid() and self.ofmap_out_chn.vacancy():
            m = (self.ofmap_in_chn.pop())//(128)  # right shift by 7 bits (undo weight scaling)
            self.raw_stats['post_tr_alu_comp'] += 1
            if (self.iteration == 0):  # get M_00
                self.y00 += m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get M_01
                self.y00 += m
                self.y01 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 2):  # get M_02
                self.y00 += m
                self.y01 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 3):  # get M_03
                self.y01 -= m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get M_10
                self.y00 += m
                self.y10 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 5):  # get M_11
                self.y00 += m
                self.y01 += m
                self.y10 += m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            elif (self.iteration == 6):  # get M_12
                self.y00 += m
                self.y01 -= m
                self.y10 += m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            elif (self.iteration == 7):  # get M_13
                self.y01 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 8):  # get M_20
                self.y00 += m
                self.y10 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 9):  # get M_21
                self.y00 += m
                self.y01 += m
                self.y10 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            # NOTE(review): if bias_read is still False when iteration == 10,
            # the popped m (M_22) matches no branch and is silently lost,
            # desynchronizing the remaining states.  The bias branch above
            # normally runs first — confirm upstream ordering guarantees this.
            elif (self.iteration == 10 and self.bias_read == True):  # get M_22
                self.y00 += m
                self.y01 -= m
                self.y10 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
                self.ofmap_out_chn.push(self.y00)  # y00 done
                # send y00 immediately w/o writing to rf
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1
            elif (self.iteration == 11):  # get M_23
                self.y01 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
                self.ofmap_out_chn.push(self.y01)  # y01 done
                # send y01 immediately w/o writing to rf
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1
            elif (self.iteration == 12):  # get M_30
                self.y10 -= m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 13):  # get M_31
                self.y10 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 14):  # get M_32
                self.y10 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
                self.ofmap_out_chn.push(self.y10)  # y10 done
                # send y10 immediately w/o writing to rf
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1
            elif (self.iteration == 15):  # get M_33
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
                self.ofmap_out_chn.push(self.y11)  # y11 done
                # send y11 immediately w/o writing to rf
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1
            #self.iteration += 1
            if self.iteration == 16:
                self.transform_done.wr(True)
class Channel(Module):
    """A register-backed FIFO channel of fixed depth.

    Both pointers free-run modulo 2*depth (twice the storage size) so a
    completely full queue is distinguishable from an empty one; the
    physical slot is the pointer modulo depth.  Because the pointers are
    Reg instances, pointer updates take effect on the register-update
    phase, not immediately.
    """

    def instantiate(self, depth=2):
        # Storage plus free-running read/write pointer registers.
        self.depth = depth
        self.data = [None for _ in range(depth)]
        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self, idx=0):
        """Return the idx-th oldest element without consuming it."""
        if not self.valid(idx):
            raise ChannelError("Reading from empty channel")
        slot = (self.rd_ptr.rd() + idx) % self.depth
        return self.data[slot]

    def push(self, x):
        """Enqueue x at the tail; raises if the channel is full."""
        if not self.vacancy():
            raise ChannelError("Enqueueing into full channel")
        wr = self.wr_ptr.rd()
        self.data[wr % self.depth] = x
        self.wr_ptr.wr((wr + 1) % (2 * self.depth))

    def free(self, count=1):
        """Retire the count oldest elements (must all be present)."""
        if not self.valid(count - 1):
            raise ChannelError("Dequeueing from empty channel")
        self.rd_ptr.wr((self.rd_ptr.rd() + count) % (2 * self.depth))

    def pop(self):
        """Consume and return the head element.

        Relies on the Reg write being deferred: peek(0) after free(1)
        still observes the element being retired this cycle.
        """
        self.free(1)
        return self.peek(0)

    def valid(self, idx=0):
        """True when more than idx elements are currently enqueued."""
        occupancy = (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth)
        return occupancy > idx

    def vacancy(self, idx=0):
        """True when more than idx empty slots are currently available."""
        space = (self.rd_ptr.rd() + self.depth - self.wr_ptr.rd()) % (2 * self.depth)
        return space > idx

    def clear(self):
        # Use with care since it conflicts with enq and deq
        self.rd_ptr.wr(self.wr_ptr.rd())
class InputSerializer(Module):
    """Serializes biases, ifmap pixels and filter weights from host memory
    onto the single arch_input_chn, one chn_per_word-wide vector per tick.

    Order on the wire: all bias words first, then ifmap and weight words
    interleaved one-for-one (send_ifmap toggles each tick) until the full
    image and all filters for the pass have been sent.
    """

    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x                    # number of PE columns (= filters per pass)
        self.arr_y = arr_y                    # number of PE rows
        self.chn_per_word = chn_per_word      # channels packed into one pushed vector
        self.arch_input_chn = arch_input_chn  # single channel into the chip
        self.ifmap = None
        self.weights = None
        self.bias = None
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.ifmap_psum_done = True
        self.pass_done = Reg(False)
        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.bias_idx = 0
        self.weight_idx = 0

    def configure(self, ifmap, weights, bias, image_size, filter_size):
        """Latch the data for one pass and reset all progress flags."""
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias
        self.image_size = image_size
        self.filter_size = filter_size
        self.bias_wr_done = False
        self.fmap_wr_done = False
        self.weight_wr_done = False
        self.pass_done.wr(False)
        self.send_ifmap = True  # used to interleave sending weights and ifmaps to chip
        self.bias_sets = 2      # number of bias vectors to send before anything else

    def tick(self):
        if self.pass_done.rd():
            return

        # in_sets = self.arr_y//self.chn_per_word
        # out_sets = self.arr_x//self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        num_iteration = self.filter_size[0] * self.filter_size[1]  # (unused here)
        weights_per_filter = self.filter_size[0] * self.filter_size[1]

        if self.arch_input_chn.vacancy() and not self.pass_done.rd():
            if not self.bias_wr_done:
                # Phase 1: stream bias vectors, chn_per_word values per tick.
                kmin = self.bias_idx * self.chn_per_word
                kmax = kmin + self.chn_per_word
                data = np.array([self.bias[k] for k in range(kmin, kmax)])
                self.bias_idx += 1
            elif (not self.fmap_wr_done) and self.send_ifmap:  # send ifmap
                # Phase 2a: one ifmap pixel's channel group per tick
                # (raster order: x fastest).
                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]
                cmin = self.curr_set * self.chn_per_word  # 0
                cmax = cmin + self.chn_per_word  # 4
                data = np.array(
                    [self.ifmap[x, y, c] for c in range(cmin, cmax)])
                self.fmap_idx += 1
                self.send_ifmap = False  # alternate: next tick sends a weight word
            else:  # send weight
                # Phase 2b: one filter's channel group at tap (x, y).
                # NOTE(review): x is derived with filter_size[0] but y with
                # filter_size[1]; this is only consistent for square filters
                # — confirm non-square filters are never configured.
                x = self.weight_idx % self.filter_size[0]
                y = self.weight_idx // self.filter_size[1]
                cmin = 0
                cmax = cmin + self.chn_per_word
                data = np.array([
                    self.weights[x, y, c, self.curr_filter]
                    for c in range(cmin, cmax)
                ])
                self.curr_filter += 1
                if (not self.fmap_wr_done):
                    self.send_ifmap = True  # resume interleaving with ifmap words
            self.arch_input_chn.push(data)

            # Advance/terminate the three phases.
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_wr_done = True
                self.fmap_idx = 0
            if self.curr_filter == self.arr_x:
                # All filters sent for this tap; move to the next tap.
                self.weight_idx += 1
                self.curr_filter = 0
                if self.weight_idx == weights_per_filter:
                    self.weight_wr_done = True
                    self.pass_done.wr(True)
            if self.bias_idx == self.bias_sets:  #2
                self.bias_wr_done = True
def instantiate(self, depth=2):
    """Set up FIFO storage and the read/write pointer registers."""
    # Pointers are Reg-backed; they run modulo 2*depth elsewhere so a
    # full queue can be told apart from an empty one.
    self.depth = depth
    self.data = [None for _ in range(depth)]
    self.rd_ptr = Reg(0)
    self.wr_ptr = Reg(0)
class InputSerializer(Module):
    """Winograd-variant input serializer.

    configure() performs the *off-chip* Winograd pre-transforms in numpy
    (weights G -> U = G·g·G_T, padded ifmap tiles d -> V = B_T·d·B) and
    tick() then streams biases, transformed ifmap tiles and transformed
    weights onto arch_input_chn, one chn_per_word-wide vector per tick.
    """

    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word
        self.arch_input_chn = arch_input_chn
        self.ifmap = None        # set in configure() to the transformed V
        self.weights = None      # set in configure() to the transformed U
        self.image_size = (0, 0)
        self.filter_size = (0, 0)
        self.ifmap_psum_done = True
        self.pass_done = Reg(False)
        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.bias_idx = 0

    def configure(self, ifmap, weights, biases, image_size, filter_size):
        """Precompute Winograd transforms off-chip and reset pass state."""
        #self.ifmap = ifmap
        #self.weights = weights
        self.biases = biases
        self.image_size = image_size
        # NOTE(review): filter_size is set to image_size, not the
        # filter_size parameter.  The on-chip "weights" are the 4x4
        # transformed U tiles, so weights_per_filter must count 16 taps,
        # not 9 — this only works if image_size is (4, 4); confirm.
        self.filter_size = image_size
        self.num_tiles = 4
        self.send_ifmap = True
        self.fmap_idx = 0
        self.fmap_tile = 0
        self.weight_idx = 0
        self.bias_sets = 2
        self.fmap_wr_done = False
        self.weight_wr_done = False
        self.bias_wr_done = False
        self.pass_done.wr(False)

        # pad the ifmaps: np.pad pads every axis (incl. channels) by 1;
        # the slice strips the channel padding again, keeping 4 channels.
        ifmap_padded = np.pad(ifmap, 1, 'constant')
        ifmap_padded = ifmap_padded[:, :, 1:5]

        # Winograd transforms (F(2x2, 3x3) matrices).
        B_T = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0],
                        [0, 1, 0, -1]])
        B = B_T.transpose()
        G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5],
                      [0, 0, 1]])
        G_T = G.transpose()
        # A/A_T listed for reference only — the inverse transform is done
        # on-chip, not in this off-chip step.
        A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]])
        A = A_T.transpose()

        C = 4  # num channels
        K = 8  # num filters
        T = 4  # num tiles
        U = np.zeros([4, 4, C, K])  # 4,4,4,8
        V = np.zeros([4, 4, C, T])  # 4,4,4
        # FOR LOOPS USED B/C NOT COUNTING OFF CHIP PROCESSING IN
        # PERFORMANCE STATISTICS (will unroll loops in on chip processing)
        for t in range(T):
            # U does not depend on t; recomputing it per tile is redundant
            # but harmless.
            for k in range(K):  # filter
                for c in range(C):  # channel
                    g = weights[:, :, c, k]  # 3x3 filter
                    U[:, :, c, k] = np.dot(G, np.dot(g, G_T))  # 4x4
            for c in range(C):  # channel
                # Tiles laid out 2x2 over the padded image, stride 2.
                x_idx = (t // 2) * 2
                y_idx = (t % 2) * 2
                d = ifmap_padded[x_idx:x_idx + 4,
                                 y_idx:y_idx + 4, c]  # 4x4 ifmap tile
                V[:, :, c, t] = np.dot(B_T, np.dot(d, B))

        # Convert to integers for on chip processing, LOSE ACCURACY -> bit shift
        # left shift by 7 bits to avoid precision loss when convert float to int
        U = 128 * U
        #V = 128*V;
        self.weights = U.astype(np.int64)  # transformed weights
        self.ifmap = V.astype(np.int64)    # transformed ifmap

    def tick(self):
        if self.pass_done.rd():
            return

        in_sets = self.arr_y // self.chn_per_word   # 1 (unused here)
        out_sets = self.arr_x // self.chn_per_word  # 2 (unused here)
        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        weights_per_filter = self.filter_size[0] * self.filter_size[1]

        if self.arch_input_chn.vacancy() and not self.pass_done.rd():
            if not self.bias_wr_done:
                # Phase 1: stream bias vectors.
                kmin = self.bias_idx * self.chn_per_word
                kmax = kmin + self.chn_per_word
                data = np.array([self.biases[k] for k in range(kmin, kmax)])
                self.bias_idx += 1
            elif not self.fmap_wr_done:  # send ifmap
                # Phase 2: one transformed-ifmap channel group per tick,
                # iterating tiles fastest, then tile positions.
                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]
                cmin = self.curr_set * self.chn_per_word  # 0
                cmax = cmin + self.chn_per_word  # 4
                data = np.array([
                    self.ifmap[x, y, c, self.fmap_tile]
                    for c in range(cmin, cmax)
                ])
                self.fmap_tile += 1
            else:  # send weight
                # Phase 3: one transformed-weight channel group per tick,
                # filters fastest, then 4x4 tap positions.
                x = self.weight_idx % self.filter_size[0]
                y = self.weight_idx // self.filter_size[1]
                cmin = 0
                cmax = cmin + self.chn_per_word
                data = np.array([
                    self.weights[x, y, c, self.curr_filter]
                    for c in range(cmin, cmax)
                ])
                self.curr_filter += 1
            self.arch_input_chn.push(data)

            # Advance/terminate the three phases.
            if self.fmap_tile == self.num_tiles:
                self.fmap_tile = 0
                self.fmap_idx += 1
                if self.fmap_idx == fmap_per_iteration:
                    self.fmap_wr_done = True
                    self.fmap_idx = 0
            if self.curr_filter == self.arr_x:
                self.weight_idx += 1
                self.curr_filter = 0
                if self.weight_idx == weights_per_filter:
                    self.weight_wr_done = True
                    self.pass_done.wr(True)
            if self.bias_idx == self.bias_sets:  #2
                self.bias_wr_done = True
class OutputDeserializer(Module):
    """Collects tagged result vectors from arch_output_chn and scatters
    them into the ofmap array.

    Each received message is a sequence of (loc_tag, value) pairs; the
    tag encodes the pixel position, so results may arrive out of raster
    order.  Raises Finish once the whole image has been written.
    """

    def instantiate(self, arch_output_chn, arr_y, block_size, num_nonzero):
        # PE static configuration (immutable)
        self.arr_y = arr_y
        self.block_size = block_size          # channels per received vector
        self.num_nonzero = num_nonzero        # stored but unused in this class
        self.arch_output_chn = arch_output_chn
        self.ofmap = None
        self.reference = None                 # stored but unused in this class
        self.image_size = (0, 0)
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, out_chn):
        """Latch output buffers for one pass and reset progress counters."""
        self.ofmap = ofmap
        self.reference = reference
        self.out_chn = out_chn
        self.image_size = image_size
        self.curr_set = 0
        self.fmap_idx = 0
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.out_chn // self.block_size
        fmap_per_iteration = self.image_size[0] * self.image_size[1]

        if self.arch_output_chn.valid():
            rcvd = self.arch_output_chn.pop()
            loc_tag = [e[0] for e in rcvd]  # pixel tags (all pairs share one position)
            data = [e[1] for e in rcvd]     # ofmap values
            # Decode the pixel position from the tag rather than assuming
            # raster arrival order.
            x = loc_tag[0] // self.image_size[1]
            y = loc_tag[0] % self.image_size[1]
            #x = self.fmap_idx % self.image_size[0]
            #y = self.fmap_idx // self.image_size[0]
            self.fmap_idx = x + y * self.image_size[0]
            if self.curr_set < out_sets:
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size
                for c in range(cmin, cmax):
                    # should never replace an existing value
                    assert (self.ofmap[x, y, c] == 0)
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1
            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                self.pass_done.wr(True)
                # Framework exception that terminates the simulation.
                raise Finish("Done processing")