Example #1
class RegTB(Module):
    def instantiate(self):
        self.ra = Reg(0)
        self.rb = Reg(10)

    def tick(self):
        print("ra val: %d" % self.ra.rd())
        print("rb val: %d" % self.rb.rd())
        self.ra.wr(self.ra.rd() + 1)
        self.rb.wr(self.ra.rd() + self.rb.rd())
Example #2
class FIFOTB(Module):
    def instantiate(self, check_fifo=True):
        self.check_fifo = check_fifo
        self.counter = Reg(0)
        self.fifo = FIFO(4)

    def tick(self):
        count = self.counter.rd()
        self.counter.wr((count + 1) % 256)

        if count % 4 < 2 and (self.fifo.not_full() or not self.check_fifo):
            self.fifo.enq(count)
            print("enq: %d" % count)

        if count % 4 == 3 and (self.fifo.not_empty() or not self.check_fifo):
            peek = self.fifo.peek()
            self.fifo.deq()
            print("deq: %d" % peek)
Example #3
class FIFO(Module):
    def instantiate(self, depth=2):
        self.data = [0] * depth
        self.depth = depth

        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self):
        if self.wr_ptr.rd() == self.rd_ptr.rd():
            raise FIFOError("Reading from empty FIFO")
        return self.data[self.rd_ptr.rd() % self.depth]

    def enq(self, x):
        if (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 *
                                                    self.depth) == self.depth:
            raise FIFOError("Enqueueing into full FIFO")
        self.data[self.wr_ptr.rd() % self.depth] = x
        self.wr_ptr.wr((self.wr_ptr.rd() + 1) % (2 * self.depth))

    def deq(self):
        if self.wr_ptr.rd() == self.rd_ptr.rd():
            raise FIFOError("Dequeueing from empty FIFO")
        self.rd_ptr.wr((self.rd_ptr.rd() + 1) % (2 * self.depth))

    def not_full(self):
        return not ((self.wr_ptr.rd() - self.rd_ptr.rd()) %
                    (2 * self.depth) == self.depth)

    def not_empty(self):
        return not (self.wr_ptr.rd() == self.rd_ptr.rd())

    def reset(self):
        self.rd_ptr.wr(self.wr_ptr.rd())
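
A minimal usage sketch for the FIFO above, assuming stand-in Module, Reg, and FIFOError classes like the ones below are defined before the FIFO class (the real framework's versions presumably do more, e.g. latch register writes on the clock tick). The point to notice is the pointer arithmetic: both pointers run modulo 2*depth, so wr_ptr == rd_ptr means empty and a difference of exactly depth means full.

class Module:
    # stand-in: just forwards constructor arguments to instantiate()
    def __init__(self, *args, **kwargs):
        self.instantiate(*args, **kwargs)

class Reg:
    # stand-in register whose writes become visible immediately
    def __init__(self, init):
        self.val = init
    def rd(self):
        return self.val
    def wr(self, x):
        self.val = x

class FIFOError(Exception):
    pass

fifo = FIFO(depth=2)       # the FIFO class above
fifo.enq(10)
fifo.enq(20)
print(fifo.not_full())     # False: (wr_ptr - rd_ptr) % (2*depth) == depth
print(fifo.peek())         # 10
fifo.deq()
print(fifo.not_empty())    # True, one element left
print(fifo.peek())         # 20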
Example #4
class WindowFIFO(Module):
    def instantiate(self, depth, peek_window, enq_window, deq_window):
        self.data = [0] * depth
        self.depth = depth
        self.peek_window = peek_window
        self.enq_window = enq_window
        self.deq_window = deq_window

        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self):
        if (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2*self.depth) \
                < self.peek_window:
            raise FIFOError("Reading from empty FIFO")
        peek_output = [0] * self.peek_window
        for i in range(self.peek_window):
            peek_output[i] = self.data[(self.rd_ptr.rd() + i) % self.depth]
        return peek_output

    def enq(self, x):
        if (self.wr_ptr.rd() - self.rd_ptr.rd() + self.enq_window - 1) % \
                (2*self.depth) >= self.depth:
            raise FIFOError("Enqueueing into full FIFO")
        for i in range(self.enq_window):
            self.data[(self.wr_ptr.rd() + i) % self.depth] = x[i]
        self.wr_ptr.wr((self.wr_ptr.rd() + self.enq_window) % (2 * self.depth))

    def deq(self):
        if (self.wr_ptr.rd() - self.rd_ptr.rd()) % (2*self.depth) \
                < self.deq_window:
            raise FIFOError("Dequeueing from empty FIFO")
        self.rd_ptr.wr((self.rd_ptr.rd() + self.deq_window) % (2 * self.depth))

    def not_full(self):
        return not (
            (self.wr_ptr.rd() - self.rd_ptr.rd() + self.enq_window - 1) %
            (2 * self.depth) >= self.depth)

    def valid(self):
        return not ((self.wr_ptr.rd() - self.rd_ptr.rd()) %
                    (2 * self.depth) < self.peek_window)

    def not_empty(self):
        return not ((self.wr_ptr.rd() - self.rd_ptr.rd()) %
                    (2 * self.depth) < self.deq_window)

    def clear(self):
        self.rd_ptr.wr(self.wr_ptr.rd())
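
A small standalone check (an illustration, not part of the class) of the enq-side "full" test used above: occupancy is (wr_ptr - rd_ptr) % (2*depth), and enq must refuse once fewer than enq_window free slots remain.

depth, enq_window = 4, 2       # example parameters, not taken from the source
for occupancy in range(depth + 1):
    wr, rd = occupancy, 0      # any pointer pair with this occupancy behaves the same
    full = (wr - rd + enq_window - 1) % (2 * depth) >= depth
    print(occupancy, "full for enq" if full else "room for a window")
# occupancies 0..2 leave room; 3 and 4 report full (fewer than 2 free slots)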
Example #5
File: channel.py Project: bshimanuki/6.888
class Channel(Module):
    def instantiate(self, depth=2, name=None):
        self.data = [None] * depth
        self.depth = depth
        self.name = name

        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self, idx=0):
        if not self.valid(idx):
            raise ChannelError("Reading from empty channel")
        return self.data[(self.rd_ptr.rd() + idx) % self.depth]

    def push(self, x):
        if not self.vacancy():
            raise ChannelError("Enqueueing into full channel")
        self.data[self.wr_ptr.rd() % self.depth] = x
        self.wr_ptr.wr((self.wr_ptr.rd() + 1) % (2 * self.depth))

        if self.name != None:
            self.output_file.write("chn {} push\n".format(self.name))

    def free(self, count=1):
        if not self.valid(count - 1):
            raise ChannelError("Dequeueing from empty channel")
        self.rd_ptr.wr((self.rd_ptr.rd() + count) % (2 * self.depth))

    def pop(self):
        self.free(1)
        ret = self.peek(0)

        if self.name != None:
            self.output_file.write("chn {} pop\n".format(self.name))
        return ret

    def valid(self, idx=0):
        return ((self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth)) > idx

    def vacancy(self, idx=0):
        return ((self.rd_ptr.rd() + self.depth - self.wr_ptr.rd()) %
                (2 * self.depth)) > idx

    def clear(self):
        # Use with care since it conflicts with enq and deq
        self.rd_ptr.wr(self.wr_ptr.rd())
Example #6
class Channel(Module):
    def instantiate(self, depth=2):
        self.data = [None] * depth
        self.depth = depth

        self.rd_ptr = Reg(0)
        self.wr_ptr = Reg(0)

    def peek(self, idx=0):
        if not self.valid(idx):
            raise ChannelError("Reading from empty channel")
        return self.data[(self.rd_ptr.rd() + idx) % self.depth]

    def push(self, x):
        if not self.vacancy():
            raise ChannelError("Enqueueing into full channel")
        self.data[self.wr_ptr.rd() % self.depth] = x
        self.wr_ptr.wr((self.wr_ptr.rd() + 1) % (2 * self.depth))

    def free(self, count=1):
        if not self.valid(count - 1):
            raise ChannelError("Dequeueing from empty channel")
        self.rd_ptr.wr((self.rd_ptr.rd() + count) % (2 * self.depth))

    def pop(self):
        self.free(1)
        return self.peek(0)

    def valid(self, idx=0):  # check not empty
        return ((self.wr_ptr.rd() - self.rd_ptr.rd()) % (2 * self.depth)) > idx

    def vacancy(self, idx=0):  # check not full
        return ((self.rd_ptr.rd() + self.depth - self.wr_ptr.rd()) %
                (2 * self.depth)) > idx

    def clear(self):
        # Use with care since it conflicts with enq and deq
        self.rd_ptr.wr(self.wr_ptr.rd())
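
A minimal sketch of why pop() above can call free() before peek() and still return the head element. It assumes the framework's Reg latches writes until the next clock tick; the Reg stand-in and its tick() method below are illustrative, not the real library API. Within one tick, rd() keeps returning the old pointer even after wr() has been called.

class Reg:
    # two-phase stand-in: wr() latches a value, tick() makes it visible
    def __init__(self, init):
        self._q = init   # value visible through rd()
        self._d = init   # value latched for the next tick
    def rd(self):
        return self._q
    def wr(self, x):
        self._d = x
    def tick(self):
        self._q = self._d

rd_ptr = Reg(0)
data = ['a', 'b', None, None]   # depth = 4, two entries queued

rd_ptr.wr(rd_ptr.rd() + 1)      # free(1): advance the pointer
print(data[rd_ptr.rd() % 4])    # 'a' -- old pointer still visible, as in pop()
rd_ptr.tick()                   # clock edge
print(data[rd_ptr.rd() % 4])    # 'b' -- the pointer has advanced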
Example #7
class InputSerializer(Module):
    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_input_chn = arch_input_chn

        self.ifmap = None
        self.weights = None

        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.bias_idx = 0

    def configure(self, ifmap, weights, biases, image_size, filter_size):
        #self.ifmap = ifmap
        #self.weights = weights
        self.biases = biases

        self.image_size = image_size
        self.filter_size = image_size
        self.num_tiles = 4

        self.send_ifmap = True
        self.fmap_idx = 0
        self.fmap_tile = 0
        self.weight_idx = 0

        self.bias_sets = 2

        self.fmap_wr_done = False
        self.weight_wr_done = False
        self.bias_wr_done = False
        self.pass_done.wr(False)

        # pad the ifmaps
        ifmap_padded = np.pad(ifmap, 1, 'constant')
        ifmap_padded = ifmap_padded[:, :, 1:5]

        # Winograd transforms
        B_T = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0],
                        [0, 1, 0, -1]])
        B = B_T.transpose()
        G = np.array([[1, 0, 0], [0.5, 0.5, 0.5], [0.5, -0.5, 0.5], [0, 0, 1]])
        G_T = G.transpose()
        A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]])
        A = A_T.transpose()

        C = 4  # num channels
        K = 8  # num filters
        T = 4  # num tiles
        U = np.zeros([4, 4, C, K])  # 4,4,4,8
        V = np.zeros([4, 4, C, T])  # 4,4,4
        # FOR LOOPS USED B/C NOT COUNTING OFF CHIP PROCESSING IN PERFORMANCE STATISTICS (will unroll loops in on chip processing)
        for t in range(T):
            for k in range(K):  # filter
                for c in range(C):  # channel
                    g = weights[:, :, c, k]  # 3x3 filter
                    U[:, :, c, k] = np.dot(G, np.dot(g, G_T))  # 4x4
            for c in range(C):  # channel
                x_idx = (t // 2) * 2
                y_idx = (t % 2) * 2
                d = ifmap_padded[x_idx:x_idx + 4, y_idx:y_idx + 4,
                                 c]  # 4x4 ifmap tile
                V[:, :, c, t] = np.dot(B_T, np.dot(d, B))
        # Convert to integers for on chip processing, LOSE ACCURACY -> bit shift
        U = 128 * U
        # left shift by 7 bits to avoid precision loss when convert float to int
        #V = 128*V;
        self.weights = U.astype(np.int64)  # transformed weights
        self.ifmap = V.astype(np.int64)  # transformed ifmap
        #print ("U ser: ", self.weights)
        #print ("V ser: ", self.ifmap)

    def tick(self):
        if self.pass_done.rd():
            return

        in_sets = self.arr_y // self.chn_per_word  # 1
        out_sets = self.arr_x // self.chn_per_word  # 2

        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        weights_per_filter = self.filter_size[0] * self.filter_size[1]

        if self.arch_input_chn.vacancy() and not self.pass_done.rd():
            if not self.bias_wr_done:
                kmin = self.bias_idx * self.chn_per_word
                kmax = kmin + self.chn_per_word
                data = np.array([self.biases[k] for k in range(kmin, kmax)])
                self.bias_idx += 1
                #print ("input ser kmin,kmax,biases: ",kmin,kmax,data)
            elif not self.fmap_wr_done:  # send ifmap
                # send 4 elements of ifmap
                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]
                cmin = self.curr_set * self.chn_per_word  # 0
                cmax = cmin + self.chn_per_word  # 4
                data = np.array([
                    self.ifmap[x, y, c, self.fmap_tile]
                    for c in range(cmin, cmax)
                ])
                self.fmap_tile += 1
                #print ("input ser x,y,cmin,cmax,ifmaps: ",x,y,cmin,cmax,data)
            else:  # send weight
                # send 4 elements of weights (twice in succession)
                x = self.weight_idx % self.filter_size[0]
                y = self.weight_idx // self.filter_size[1]
                cmin = 0
                cmax = cmin + self.chn_per_word
                data = np.array([
                    self.weights[x, y, c, self.curr_filter]
                    for c in range(cmin, cmax)
                ])
                self.curr_filter += 1
                #print ("input ser x,y,cmin,cmax,curr_filter,weights: ",x,y,cmin,cmax,self.curr_filter,data)
            self.arch_input_chn.push(data)
            if self.fmap_tile == self.num_tiles:
                self.fmap_tile = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_wr_done = True
                self.fmap_idx = 0
            if self.curr_filter == self.arr_x:
                self.weight_idx += 1
                self.curr_filter = 0
            if self.weight_idx == weights_per_filter:
                self.weight_wr_done = True
                self.pass_done.wr(True)
            if self.bias_idx == self.bias_sets:  #2
                self.bias_wr_done = True
Example #8
class OutputDeserializer(Module):
    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word,
                    finish_signal_chn):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_output_chn = arch_output_chn

        self.finish_signal_chn = finish_signal_chn

        self.ofmap = None
        self.ofmap_transformed = None
        self.reference = None

        self.image_size = (0, 0)

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, bias):
        #       self.ofmap = np.zeros((2, 2, self.arr_x, 4)).astype(np.int64) # 2x2x8x4
        self.ofmap = np.zeros((image_size[0], image_size[1],
                               self.arr_x)).astype(np.int64)  # 4x4x8
        self.reference = reference
        self.num_tiles = 4
        self.curr_tile = 0

        self.image_size = image_size
        self.bias = bias  # TODO

        self.curr_set = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        self.A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]])
        self.A = self.A_T.transpose()

        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            # partly parallelized to be on chip:
            #            x_idx = (self.curr_tile // 2)*2
            #            y_idx = (self.curr_tile % 2)*2
            #            self.ofmap_transformed[x_idx:x_idx+2, y_idx:y_idx+2, self.curr_chn] += np.dot(self.A_T, np.dot(self.ofmap[:,:,self.curr_chn, self.curr_tile],self.A))
            #            self.curr_tile += 1
            #            if self.curr_tile == 4:
            #                self.curr_tile = 0
            #                self.ofmap_transformed[:,:,self.curr_chn] += self.bias[self.curr_chn] # add bias
            #                self.curr_chn += 1
            #            if self.curr_chn == 8:
            #                print ("reference shape: ", self.reference.shape)
            #                print ("ofmap shape: ", self.ofmap.shape)

            # FOR LOOPS USED B/C NOT COUNTING OFF CHIP PROCESSING IN PERFORMANCE STATISTICS (will unroll loops in on chip processing)
            #            for k in range(8):
            #                self.ofmap_transformed[:,:,k] += self.bias[k] # add bias
            #                for t in range(self.num_tiles):
            #                    x_idx = (t // 2)*2
            #                    y_idx = (t % 2)*2
            #                    self.ofmap_transformed[x_idx:x_idx+2,y_idx:y_idx+2,k] += np.dot(self.A_T,np.dot(self.ofmap[:,:,k,t],self.A))
            #            self.finish_signal_chn.push(True)
            if np.all(self.ofmap == self.reference):
                raise Finish("Success")
            else:
                print("ofmap: ")
                print(self.ofmap)
                print("reference: ")
                print(self.reference)
                print("difference: ")
                print(self.ofmap - self.reference)
                raise Finish("Validation Failed")

        else:
            #print ("output deser curr_tile, fmap_idx: ", self.curr_tile, self.fmap_idx)
            out_sets = self.arr_x // self.chn_per_word  # 2
            fmap_per_iteration = 4  # ofmap size, parametrize .. TODO

            if self.arch_output_chn.valid():
                data = [e for e in self.arch_output_chn.pop()]

                x_idx = (self.curr_tile // 2) * 2
                y_idx = (self.curr_tile % 2) * 2
                x = (self.fmap_idx % 2) + x_idx
                y = self.fmap_idx // 2 + y_idx

                #                    self.ofmap_transformed[x_idx:x_idx+2,y_idx:y_idx+2,k] += np.dot(self.A_T,np.dot(self.ofmap[:,:,k,t],self.A))

                if self.curr_set < out_sets:
                    cmin = self.curr_set * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    for c in range(cmin, cmax):
                        self.ofmap[x, y, c] = data[c - cmin]
                self.curr_set += 1

                if self.curr_set == out_sets:
                    self.curr_set = 0
                    #self.fmap_idx += 1
                    self.curr_tile += 1
                if self.curr_tile == 4:
                    self.fmap_idx += 1
                    self.curr_tile = 0
                if self.fmap_idx == fmap_per_iteration:
                    self.fmap_idx = 0
                    self.curr_tile = 0
                    # self.ofmap = self.ofmap//(128*128)
                    self.pass_done.wr(True)
Example #9
class PostTransform(Module):
    def instantiate(self, locx, locy, bias_chn, ofmap_in_chn, ofmap_out_chn): #ofmap_in
        self.locx = locx
        self.locy = locy
        self.bias_chn = bias_chn
        self.ofmap_in_chn = ofmap_in_chn
        self.ofmap_out_chn = ofmap_out_chn
        self.transform_done = Reg(False)
        
        self.stat_type = 'aggregate'
        self.raw_stats = {'post_tr_alu_comp' : 0, 'post_tr_rf_rd' : 0, 'post_tr_ifmap_rf_wr' : 0}
        
    def configure(self):
        self.bias = 0
        self.iteration = 0
        self.y00 = None
        self.y01 = None
        self.y10 = None
        self.y11 = None
        self.transform_done.wr(False)
        self.bias_read = False
        
# Explanation of algorithm: transform ofmap M into y, performing inverse Winograd transform y = A_T*M*A
#    M = [M00 M01 M02 M03
#         M10 M11 M12 M13
#         M20 M21 M22 M23
#         M30 M31 M32 M33]
#
#    A_T = [1  1  1  0
#           0  1 -1 -1]
#
#    A = [1  0
#         1  1
#         1 -1
#         0 -1]
#
# Performing this transform yields a 2x2 output for a given 4x4 input:
#
#    y = [y00 y01
#         y10 y11]
#    ... such that:
#    y00 = M00+M01+M02+M10+M11+M12+M20+M21+M22
#    y01 = M01-M02-M03+M11-M12-M13+M21-M22-M23
#    y10 = M10+M11+M12-M20-M21-M22-M30-M31-M32
#    y11 = M11-M12-M13-M21+M22+M23-M31+M32+M33   

    def tick(self):
        if self.transform_done.rd():
            return
        if self.bias_chn.valid(): # should only ever be valid once
            self.bias = self.bias_chn.pop()
            self.bias_read = True
            self.y00 = self.bias
            self.y01 = self.bias
            self.y10 = self.bias
            self.y11 = self.bias
            self.raw_stats['post_tr_alu_comp'] += 4
            self.raw_stats['post_tr_ifmap_rf_wr'] += 4     
        elif self.ofmap_in_chn.valid() and self.ofmap_out_chn.vacancy():
            m = (self.ofmap_in_chn.pop())//(128) # right shift by 7 bits
            self.raw_stats['post_tr_alu_comp'] += 1
            #print("post tr -- iteration ", self.iteration)
            if (self.iteration == 0):    # get M_00
                self.y00 += m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get M_01
                self.y00 += m
                self.y01 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 2):  # get M_02     
                self.y00 += m
                self.y01 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 3):  # get M_03
                self.y01 -= m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get M_10     
                self.y00 += m
                self.y10 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 5):  # get M_11     
                self.y00 += m
                self.y01 += m
                self.y10 += m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            elif (self.iteration == 6):  # get M_12
                self.y00 += m
                self.y01 -= m
                self.y10 += m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            elif (self.iteration == 7):  # get M_13
                self.y01 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 8):  # get M_20
                self.y00 += m
                self.y10 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 9):  # get M_21     
                self.y00 += m
                self.y01 += m
                self.y10 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
            elif (self.iteration == 10 and self.bias_read == True): # get M_22       
                self.y00 += m
                self.y01 -= m
                self.y10 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 4
                self.raw_stats['post_tr_rf_rd'] += 4
                self.raw_stats['post_tr_ifmap_rf_wr'] += 4
                self.iteration += 1
                #print("post tr pushing y00: ", self.y00, self.bias)
                self.ofmap_out_chn.push(self.y00) # y00 done
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1 # send y00 immediately w/o writing to rf
            elif (self.iteration == 11): # get M_23
                self.y01 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
                #print("post tr pushing y01: ", self.y01, self.bias)
                self.ofmap_out_chn.push(self.y01) # y01 done
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1 # send y01 immediately w/o writing to rf
            elif (self.iteration == 12): # get M_30     
                self.y10 -= m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 13): # get M_31
                self.y10 -= m
                self.y11 -= m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
            elif (self.iteration == 14): # get M_32     
                self.y10 -= m
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 2
                self.raw_stats['post_tr_rf_rd'] += 2
                self.raw_stats['post_tr_ifmap_rf_wr'] += 2
                self.iteration += 1
                #print("post tr pushing y10: ", self.y10, self.bias)
                self.ofmap_out_chn.push(self.y10) # y10 done
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1 # send y10 immediately w/o writing to rf
            elif (self.iteration == 15): # get M_33
                self.y11 += m
                self.raw_stats['post_tr_alu_comp'] += 1
                self.raw_stats['post_tr_rf_rd'] += 1
                self.raw_stats['post_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
                #print("post tr pushing y11: ", self.y11, self.bias)
                self.ofmap_out_chn.push(self.y11) # y11 done
                self.raw_stats['post_tr_ifmap_rf_wr'] -= 1 # send y11 immediately w/o writing to rf
            #self.iteration += 1
        if self.iteration == 16:
            self.transform_done.wr(True)
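
A standalone numpy cross-check (an illustration, not part of the module) that the per-element accumulations in PostTransform.tick() compute the inverse Winograd transform y = A_T*M*A described in the comment above; the bias term is left out for clarity.

import numpy as np

A_T = np.array([[1, 1, 1, 0], [0, 1, -1, -1]])
A = A_T.transpose()
M = np.random.randint(-10, 10, size=(4, 4))

y_ref = np.dot(A_T, np.dot(M, A))   # 2x2 reference result

# The same four outputs written as the scalar sums accumulated in tick():
y00 = M[0,0]+M[0,1]+M[0,2]+M[1,0]+M[1,1]+M[1,2]+M[2,0]+M[2,1]+M[2,2]
y01 = M[0,1]-M[0,2]-M[0,3]+M[1,1]-M[1,2]-M[1,3]+M[2,1]-M[2,2]-M[2,3]
y10 = M[1,0]+M[1,1]+M[1,2]-M[2,0]-M[2,1]-M[2,2]-M[3,0]-M[3,1]-M[3,2]
y11 = M[1,1]-M[1,2]-M[1,3]-M[2,1]+M[2,2]+M[2,3]-M[3,1]+M[3,2]+M[3,3]

assert np.array_equal(y_ref, np.array([[y00, y01], [y10, y11]]))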
Example #10
class InputSerializer(Module):
    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_input_chn = arch_input_chn

        self.ifmap = None
        self.weights = None
        self.bias = None

        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0

    def configure(self, ifmap, weights, bias, image_size, filter_size):
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias

        self.image_size = image_size
        self.filter_size = filter_size

        self.ifmap_psum_done = False
        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        in_sets = self.arr_y // self.chn_per_word
        out_sets = self.arr_x // self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        num_iteration = self.filter_size[0] * self.filter_size[1]

        if not self.ifmap_psum_done:
            if self.arch_input_chn.vacancy():
                # print "input append"

                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]

                if self.curr_set < in_sets:
                    cmin = self.curr_set * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    # Write ifmap to glb
                    data = np.array(
                        [self.ifmap[x, y, c] for c in range(cmin, cmax)])
                else:
                    cmin = (self.curr_set - in_sets) * self.chn_per_word
                    cmax = cmin + self.chn_per_word
                    # Write bias to glb
                    data = np.array([self.bias[c] for c in range(cmin, cmax)])
                self.arch_input_chn.push(data)
                self.curr_set += 1

                if self.curr_set == (in_sets + out_sets):
                    self.curr_set = 0
                    self.fmap_idx += 1
                if self.fmap_idx == fmap_per_iteration:
                    self.fmap_idx = 0
                    self.ifmap_psum_done = True
                    # print "---- Wrote inputs and biases ----"
        else:
            f_x = self.iteration % self.filter_size[0]
            f_y = self.iteration // self.filter_size[0]

            # Push filters to PE columns. (PE is responsible for pop)
            if self.arch_input_chn.vacancy() and self.iteration < num_iteration:
                cmin = self.curr_set * self.chn_per_word
                cmax = cmin + self.chn_per_word
                data = np.array([self.weights[f_x, f_y, c, self.curr_filter] \
                        for c in range(cmin, cmax) ])

                self.arch_input_chn.push(data)
                self.curr_set += 1
                if self.curr_set == in_sets:
                    self.curr_set = 0
                    self.curr_filter += 1
                if self.curr_filter == self.arr_x:
                    self.curr_filter = 0
                    # print "---- Wrote weights iteration: %d ----" % self.iteration
                    self.iteration += 1
                if self.iteration == num_iteration:
                    # print "---- Wrote all weights ----"
                    self.pass_done.wr(True)
Example #11
class OutputDeserializer(Module):
    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_output_chn = arch_output_chn

        self.ofmap = None
        self.reference = None

        self.image_size = (0, 0)

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, filter_size):
        self.ofmap = ofmap
        self.reference = reference

        self.image_size = image_size
        self.filter_size = filter_size

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.arr_x // self.chn_per_word
        #fmap_per_iteration = self.image_size[0]*self.image_size[1]
        fmap_per_iteration = (self.image_size[0] - self.filter_size[0] + 1) * (
            self.image_size[1] - self.filter_size[1] + 1)

        if self.arch_output_chn.valid():
            data = [e for e in self.arch_output_chn.pop()]

            x = self.fmap_idx % self.image_size[0]
            y = self.fmap_idx // self.image_size[0]

            x = self.fmap_idx % (self.image_size[0] - self.filter_size[0] + 1)
            y = self.fmap_idx // (self.image_size[0] - self.filter_size[0] + 1)

            if self.curr_set < out_sets:
                cmin = self.curr_set * self.chn_per_word
                cmax = cmin + self.chn_per_word
                for c in range(cmin, cmax):
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1

            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                self.pass_done.wr(True)
                if np.all(self.ofmap == self.reference):
                    raise Finish("Success")
                else:
                    print(self.ofmap)
                    print(self.reference)
                    print(self.ofmap - self.reference)
                    raise Finish("Validation Failed")
class PreTransformIFMap(Module):
    def instantiate(self, locx, locy, ifmap_in_chn, ifmap_out_chn):
        self.locx = locx
        self.locy = locy
        self.ifmap_in_chn = ifmap_in_chn
        self.ifmap_out_chn = ifmap_out_chn
        self.transform_done = Reg(False)

        self.stat_type = 'aggregate'
        self.raw_stats = {
            'pre_tr_ifmap_alu_comp': 0,
            'pre_tr_ifmap_rf_rd': 0,
            'pre_tr_ifmap_rf_wr': 0
        }

    def configure(self):
        self.iteration = 0
        self.push_ctr = 0
        self.V = np.zeros([4, 4]).astype(np.int64)
        self.raw_stats['pre_tr_ifmap_rf_wr'] += 16  # write zeros into rf
        self.transform_done.wr(False)

# Explanation of algorithm: transform ifmap D into V, performing Winograd transform V = B_T*D*B
#    D = [D00 D01 D02 D03
#         D10 D11 D12 D13
#         D20 D21 D22 D23
#         D30 D31 D32 D33]
#
#    B_T = [1  0 -1  0
#           0  1  1  0
#           0 -1  1  0
#           0  1  0 -1]
#
#    B = [ 1   0  0  0
#          0   1 -1  1
#         -1   1  1  0
#          0   0  0 -1]
#
# Performing this transform yields a 4x4 output for a given 4x4 input:
#
#    V = [v00 v01 v02 v03
#         v10 v11 v12 v13
#         v20 v21 v22 v23
#         v30 v31 v32 v33]
#
#    ... such that:
#    v00 = (D00 - D02 - D20 + D22);
#    v01 = (D01 + D02 - D21 - D22);
#    v02 = (D02 - D01 + D21 - D22);
#    v03 = (D01 - D03 - D21 + D23);
#    v10 = (D10 - D12 + D20 - D22);
#    v11 = (D11 + D12 + D21 + D22);
#    v12 = (D12 - D11 - D21 + D22);
#    v13 = (D11 - D13 + D21 - D23);
#    v20 = (D12 - D10 + D20 - D22);
#    v21 = (D21 - D12 - D11 + D22);
#    v22 = (D11 - D12 - D21 + D22);
#    v23 = (D13 - D11 + D21 - D23);
#    v30 = (D10 - D12 - D30 + D32);
#    v31 = (D11 + D12 - D31 - D32);
#    v32 = (D12 - D11 + D31 - D32);
#    v33 = (D11 - D13 - D31 + D33);

    def tick(self):
        if self.transform_done.rd():
            return
        if self.ifmap_in_chn.valid() and self.ifmap_out_chn.vacancy():
            d = (self.ifmap_in_chn.pop())
            #print ("pre transform ifmap pop - locx, locy, data: ",self.locx,self.locy,d)
            if (self.iteration == 0):  # get D_00
                self.V[0][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get D_01
                self.V[0][1] += d
                self.V[0][2] -= d
                self.V[0][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 2):  # get D_02
                self.V[0][0] -= d
                self.V[0][1] += d
                self.V[0][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 3):  # get D_03
                self.V[0][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get D_10
                self.V[1][0] += d
                self.V[2][0] -= d
                self.V[3][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 5):  # get D_11
                self.V[1][1] += d
                self.V[1][2] -= d
                self.V[1][3] += d
                self.V[2][1] -= d
                self.V[2][2] += d
                self.V[2][3] -= d
                self.V[3][1] += d
                self.V[3][2] -= d
                self.V[3][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 6):  # get D_12
                self.V[1][0] -= d
                self.V[1][1] += d
                self.V[1][2] += d
                self.V[2][0] += d
                self.V[2][1] -= d
                self.V[2][2] -= d
                self.V[3][0] -= d
                self.V[3][1] += d
                self.V[3][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 7):  # get D_13
                self.V[1][3] -= d
                self.V[2][3] += d
                self.V[3][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 8):  # get D_20
                self.V[0][0] -= d
                self.V[1][0] += d
                self.V[2][0] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.iteration += 1
            elif (self.iteration == 9):  # get D_21
                self.V[0][1] -= d
                self.V[0][2] += d
                self.V[0][3] -= d
                self.V[1][1] += d
                self.V[1][2] -= d
                self.V[1][3] += d
                self.V[2][1] += d
                self.V[2][2] -= d
                self.V[2][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.iteration += 1
            elif (self.iteration == 10):  # get D_22 & start pushing transformed data out
                self.V[0][0] += d
                self.V[0][1] -= d
                self.V[0][2] -= d
                self.V[1][0] -= d
                self.V[1][1] += d
                self.V[1][2] += d
                self.V[2][0] -= d
                self.V[2][1] += d
                self.V[2][2] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 9
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 9
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 9
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v00
                self.raw_stats[
                    'pre_tr_ifmap_rf_rd'] -= 1  # push v00 immediately w/o writing to rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 11):  # get D_23
                self.V[0][3] += d
                self.V[1][3] -= d
                self.V[2][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v01
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v01 from rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 12):  # get D_30
                self.V[3][0] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v02
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v02 from rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 13):  # get D_31
                self.V[3][1] -= d
                self.V[3][2] += d
                self.V[3][3] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v03
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v03 from rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 14):  # get D_32
                self.V[3][0] += d
                self.V[3][1] -= d
                self.V[3][2] -= d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 3
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 3
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 3
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v10
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v10 from rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 15):  # get D_33
                self.V[3][3] += d
                self.raw_stats['pre_tr_ifmap_alu_comp'] += 1
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1
                self.raw_stats['pre_tr_ifmap_rf_wr'] += 1
                self.ifmap_out_chn.push(
                    self.V[self.push_ctr // 4][self.push_ctr % 4])  # push v11
                self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read v11 from rf
                #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
                #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
        elif self.iteration == 16 and self.ifmap_out_chn.vacancy():
            # done computing transform, push remaining V's sequentially
            self.ifmap_out_chn.push(self.V[self.push_ctr // 4][self.push_ctr %
                                                               4])
            self.raw_stats['pre_tr_ifmap_rf_rd'] += 1  # read vXX from rf
            #print ("pre transform ifmap - locx, locy, iteration, transformed ifmap: ", \
            #       self.locx, self.locy, self.iteration, self.V[self.push_ctr // 4][self.push_ctr % 4])
            self.push_ctr += 1
            if self.push_ctr == 16:  # all 16 transformed ifmap values have been pushed
                self.transform_done.wr(True)
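
A standalone numpy cross-check (an illustration, not part of the module) that the accumulation pattern in PreTransformIFMap.tick() matches V = B_T*D*B from the comment above; two of the listed scalar formulas are spot-checked against the matrix product.

import numpy as np

B_T = np.array([[1, 0, -1, 0], [0, 1, 1, 0], [0, -1, 1, 0], [0, 1, 0, -1]])
B = B_T.transpose()
D = np.random.randint(-10, 10, size=(4, 4))

V = np.dot(B_T, np.dot(D, B))
assert V[0, 0] == D[0, 0] - D[0, 2] - D[2, 0] + D[2, 2]   # v00
assert V[3, 3] == D[1, 1] - D[1, 3] - D[3, 1] + D[3, 3]   # v33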
Example #13
class OutputDeserializer(Module):
    def instantiate(self, arch_output_chn, psum_chn, arr_x, arr_y,
                    chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_output_chn = arch_output_chn
        self.psum_chn = psum_chn

        self.ofmap = None
        self.reference = None

        self.image_size = (0, 0)

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, curr_pass):
        if (curr_pass == 0):  # so that ofmap doesnt get rewritten with zeros
            self.ofmap = ofmap

        self.reference = reference

        self.image_size = image_size

        self.curr_set = 0
        self.fmap_idx = 0
        self.curr_pass = curr_pass
        self.num_passes = 4

        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.arr_x // self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]

        if self.arch_output_chn.valid() and (self.psum_chn.vacancy()
                                             or self.curr_pass % 2 == 1):
            data = [e for e in self.arch_output_chn.pop()]
            if (self.curr_pass % 2) == 0:  # push ofmap psum to serializer on pass 0 and 2
                self.psum_chn.push(data)

            x = self.fmap_idx % self.image_size[0]
            y = self.fmap_idx // self.image_size[0]

            if self.curr_set < out_sets:
                channel_offset = 0
                if (self.curr_pass > 1):
                    channel_offset = 8
                cmin = self.curr_set * self.chn_per_word + channel_offset
                cmax = cmin + self.chn_per_word
                for c in range(cmin, cmax):
                    self.ofmap[x, y, c] = data[c - cmin]

            self.curr_set += 1

            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                if (self.curr_pass == self.num_passes - 1):
                    self.pass_done.wr(True)
                    if np.all(self.ofmap == self.reference):
                        raise Finish("Success")
                    else:
                        print(self.ofmap)
                        print(self.reference)
                        print(self.ofmap - self.reference)
                        raise Finish("Validation Failed")
Example #14
class InputSerializer(Module):
    def instantiate(self, arch_input_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_input_chn = arch_input_chn

        self.ifmap = None
        self.weights = None
        self.bias = None

        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.bias_idx = 0
        self.weight_idx = 0

    def configure(self, ifmap, weights, bias, image_size, filter_size):
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias

        self.image_size = image_size
        self.filter_size = filter_size

        self.bias_wr_done = False
        self.fmap_wr_done = False
        self.weight_wr_done = False
        self.pass_done.wr(False)
        self.send_ifmap = True  # used to interleave sending weights and ifmaps to chip

        self.bias_sets = 2

    def tick(self):
        if self.pass_done.rd():
            return


#        in_sets = self.arr_y//self.chn_per_word
#        out_sets = self.arr_x//self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]
        num_iteration = self.filter_size[0] * self.filter_size[1]
        weights_per_filter = self.filter_size[0] * self.filter_size[1]

        if self.arch_input_chn.vacancy() and not self.pass_done.rd():
            if not self.bias_wr_done:
                kmin = self.bias_idx * self.chn_per_word
                kmax = kmin + self.chn_per_word
                data = np.array([self.bias[k] for k in range(kmin, kmax)])
                self.bias_idx += 1
                #print ("input ser kmin,kmax,bias: ",kmin,kmax,data)
            elif (not self.fmap_wr_done) and self.send_ifmap:  # send ifmap
                # send 4 elements of ifmap
                x = self.fmap_idx % self.image_size[0]
                y = self.fmap_idx // self.image_size[0]
                cmin = self.curr_set * self.chn_per_word  # 0
                cmax = cmin + self.chn_per_word  # 4
                data = np.array(
                    [self.ifmap[x, y, c] for c in range(cmin, cmax)])
                self.fmap_idx += 1
                #print ("input ser x,y,cmin,cmax,ifmaps: ",x,y,cmin,cmax,data)
                self.send_ifmap = False
            else:  # send weight
                # send 4 elements of weights (twice in succession)
                x = self.weight_idx % self.filter_size[0]
                y = self.weight_idx // self.filter_size[1]
                cmin = 0
                cmax = cmin + self.chn_per_word
                data = np.array([
                    self.weights[x, y, c, self.curr_filter]
                    for c in range(cmin, cmax)
                ])
                self.curr_filter += 1
                if (not self.fmap_wr_done):
                    self.send_ifmap = True
                #print ("input ser x,y,cmin,cmax,curr_filter,weights: ",x,y,cmin,cmax,self.curr_filter,data)
            self.arch_input_chn.push(data)
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_wr_done = True
                self.fmap_idx = 0
            if self.curr_filter == self.arr_x:
                self.weight_idx += 1
                self.curr_filter = 0
            if self.weight_idx == weights_per_filter:
                self.weight_wr_done = True
                self.pass_done.wr(True)
            if self.bias_idx == self.bias_sets:  #2
                self.bias_wr_done = True
class PreTransformWeights(Module):
    def instantiate(self, locx, locy, weight_in_chn, weight_out_chn):
        self.locx = locx
        self.locy = locy
        self.weight_in_chn = weight_in_chn
        self.weight_out_chn = weight_out_chn
        self.transform_done = Reg(False)

        self.stat_type = 'aggregate'
        self.raw_stats = {
            'pre_tr_weights_alu_comp': 0,
            'pre_tr_weights_rf_rd': 0,
            'pre_tr_weights_rf_wr': 0
        }

    def configure(self):
        self.iteration = 0
        self.push_ctr = 0
        self.U = np.zeros([4, 4]).astype(np.int64)
        self.transform_done.wr(False)

# Explanation of algorithm: transform filter weights G into U, performing Winograd transform U = H*G*H_T
#    G = [G00 G01 G02
#         G10 G11 G12
#         G20 G21 G22]
#
#    H = [1    0    0
#         0.5  0.5  0.5
#         0.5 -0.5  0.5
#         0    0    1  ]
#
#    H_T = [1  0.5  0.5  0
#           0  0.5 -0.5  0
#           0  0.5  0.5  1]
#
# Performing this transform yields a 4x4 output for a given 4x4 input:
#
#    U = [u00 u01 u02 u03
#         u10 u11 u12 u13
#         u20 u21 u22 u23
#         u30 u31 u32 u33]
#    ... such that:
#   u00 = (G00)<<7;
#   u01 = (G00 + G01 + G02)<<6;
#   u02 = (G00 - G01 + G02)<<6;
#   u03 = (G02)<<7;
#   u10 = (G00 + G10 + G20)<<6;
#   u11 = (G00 + G01 + G02 + G10 + G11 + G12 + G20 + G21 + G22)<<5;
#   u12 = (G00 - G01 + G02 + G10 - G11 + G12 + G20 - G21 + G22)<<5;
#   u13 = (G02 + G12 + G22)<<6;
#   u20 = (G00 - G10 + G20)<<6;
#   u21 = (G00 + G01 + G02 - G10 - G11 - G12 + G20 + G21 + G22)<<5;
#   u22 = (G00 - G01 + G02 - G10 + G11 - G12 + G20 - G21 + G22)<<5;
#   u23 = (G02 - G12 + G22)<<6;
#   u30 = (G20)<<7;
#   u31 = (G20 + G21 + G22)<<6;
#   u32 = (G20 - G21 + G22)<<6;
#   u33 = (G22)<<7;

    def tick(self):
        if self.transform_done.rd():
            return
        if self.weight_in_chn.valid() and self.weight_out_chn.vacancy():
            g = (self.weight_in_chn.pop())
            #print("pre tr weight: locx, locy, receive weight: ", self.locx, self.locy, g)
            if (self.iteration == 0):  # get G_00
                self.U[0][0] += g
                self.U[0][1] += g
                self.U[0][2] += g
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[0][0] = self.U[0][0] * 128  # left shift by 7
                self.raw_stats[
                    'pre_tr_weights_alu_comp'] += 10  #9 adds, 1 shift
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U00
                self.raw_stats[
                    'pre_tr_weights_rf_wr'] -= 1  # u00 sent immediately, not written back to rf
                #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
                #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 1):  # get G_01
                self.U[0][1] += g
                self.U[0][2] -= g
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] += g
                self.U[2][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                self.iteration += 1
            elif (self.iteration == 2):  # get G_02
                self.U[0][1] += g
                self.U[0][2] += g
                self.U[0][3] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[2][3] += g
                self.U[0][1] = self.U[0][1] * 64  # left shift by 6
                self.U[0][2] = self.U[0][2] * 64  # left shift by 6
                self.U[0][3] = self.U[0][3] * 128  # left shift by 7
                self.raw_stats[
                    'pre_tr_weights_alu_comp'] += 12  #9 adds/subt, 3 shift
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U01
                self.raw_stats[
                    'pre_tr_weights_rf_wr'] -= 1  # u01 sent immediately, not written back to rf
                #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
                #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 3):  # get G_10
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] -= g
                self.U[2][1] -= g
                self.U[2][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U02
                self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read u02 from rf
                #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
                #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 4):  # get G_11
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] -= g
                self.U[2][2] += g
                self.raw_stats['pre_tr_weights_alu_comp'] += 4
                self.raw_stats['pre_tr_weights_rf_rd'] += 4
                self.raw_stats['pre_tr_weights_rf_wr'] += 4
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U03
                self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read u03 from rf
                #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
                #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 5):  # get G_12
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] -= g
                self.U[2][2] -= g
                self.U[2][3] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                # no new completed weights this round
                self.iteration += 1
            elif (self.iteration == 6):  # get G_20
                self.U[1][0] += g
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[2][0] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[3][0] += g
                self.U[3][1] += g
                self.U[3][2] += g
                self.U[1][0] = self.U[1][0] * 64  # left shift 6
                self.U[2][0] = self.U[2][0] * 64  # left shift 6
                self.U[3][0] = self.U[3][0] * 128  # left shift 7
                self.raw_stats[
                    'pre_tr_weights_alu_comp'] += 12  # 9 add, 3 shift ops
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.weight_out_chn.push(
                    self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U10
                self.raw_stats[
                    'pre_tr_weights_rf_wr'] -= 1  # send u10 immediately, w/o writing to rf
                #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
                #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
                self.push_ctr += 1
                self.iteration += 1
            elif (self.iteration == 7):  # get G_21
                self.U[1][1] += g
                self.U[1][2] -= g
                self.U[2][1] += g
                self.U[2][2] -= g
                self.U[3][1] += g
                self.U[3][2] -= g
                self.raw_stats['pre_tr_weights_alu_comp'] += 6  # 6 add/sub ops
                self.raw_stats['pre_tr_weights_rf_rd'] += 6
                self.raw_stats['pre_tr_weights_rf_wr'] += 6
                # no new completed weights this round
                self.iteration += 1
            elif (self.iteration == 8):  # get G_22
                #print ("iteration 8")
                self.U[1][1] += g
                self.U[1][2] += g
                self.U[1][3] += g
                self.U[2][1] += g
                self.U[2][2] += g
                self.U[2][3] += g
                self.U[3][1] += g
                self.U[3][2] += g
                self.U[3][3] += g
                self.raw_stats['pre_tr_weights_alu_comp'] += 9  # 9 add
                self.raw_stats['pre_tr_weights_rf_rd'] += 9
                self.raw_stats['pre_tr_weights_rf_wr'] += 9
                self.iteration += 1
        elif (self.iteration == 9 and self.weight_out_chn.vacancy()):
            #print ("iteration 9")
            self.U[1][1] = self.U[1][1] * 32  # left shift by 5
            self.U[1][2] = self.U[1][2] * 32  # left shift by 5
            self.U[1][3] = self.U[1][3] * 64  # left shift by 6
            self.U[2][1] = self.U[2][1] * 32  # left shift by 5
            self.U[2][2] = self.U[2][2] * 32  # left shift by 5
            self.U[2][3] = self.U[2][3] * 64  # left shift by 6
            self.U[3][1] = self.U[3][1] * 64  # left shift by 6
            self.U[3][2] = self.U[3][2] * 64  # left shift by 6
            self.U[3][3] = self.U[3][3] * 128  # left shift by 7
            self.raw_stats['pre_tr_weights_alu_comp'] += 9  # 9 shift ops
            self.raw_stats['pre_tr_weights_rf_rd'] += 9
            self.raw_stats['pre_tr_weights_rf_wr'] += 9
            self.weight_out_chn.push(
                self.U[self.push_ctr // 4][self.push_ctr % 4])  # send U11
            self.raw_stats[
                'pre_tr_weights_rf_wr'] -= 1  # send u11 immediately w/o writing to rf
            #            print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
            #                   self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
            self.push_ctr += 1
            self.iteration += 1
        elif self.iteration == 10 and self.weight_out_chn.vacancy():
            # finish pushing transformed weights
            self.weight_out_chn.push(
                self.U[self.push_ctr // 4][self.push_ctr % 4])
            self.raw_stats['pre_tr_weights_rf_rd'] += 1  # read uXX from rf
            #print ("pre transform weights - locx, locy, iteration, transformed weight: ", \
            #       self.locx, self.locy, self.iteration, self.U[self.push_ctr // 4][self.push_ctr % 4])
            self.push_ctr += 1
            if self.push_ctr == 16:  # all 16 transformed weight values have been pushed
                self.transform_done.wr(True)
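The iteration sequence above is consistent with an incremental Winograd F(2x2, 3x3) weight pre-transform: each incoming filter tap g_ij is accumulated into the 4x4 matrix U, and the fractional coefficients are deferred to the final power-of-two multiplies so the arithmetic stays integer. This is a hedged reading, assuming the standard transform

    U = G \, g \, G^{\mathsf{T}}, \qquad
    G = \begin{bmatrix}
        1 & 0 & 0 \\
        \tfrac{1}{2} & \tfrac{1}{2} & \tfrac{1}{2} \\
        \tfrac{1}{2} & -\tfrac{1}{2} & \tfrac{1}{2} \\
        0 & 0 & 1
    \end{bmatrix}

under which every completed entry of U carries a common fixed-point scale of 2^7: a product of two 1/2 coefficients becomes a multiply by 32, a single 1/2 becomes a multiply by 64, and a unit coefficient becomes a multiply by 128, matching the "left shift" comments in the code.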
Example #16
class OutputDeserializer(Module):
    def instantiate(self, arch_output_chn, arr_y, block_size, num_nonzero):
        # PE static configuration (immutable)
        self.arr_y = arr_y
        self.block_size = block_size
        self.num_nonzero = num_nonzero

        self.arch_output_chn = arch_output_chn

        self.ofmap = None
        self.reference = None

        self.image_size = (0, 0)

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size, out_chn):
        self.ofmap = ofmap
        self.reference = reference
        self.out_chn = out_chn

        self.image_size = image_size

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        out_sets = self.out_chn // self.block_size
        fmap_per_iteration = self.image_size[0] * self.image_size[1]

        if self.arch_output_chn.valid():
            rcvd = self.arch_output_chn.pop()
            loc_tag = [e[0] for e in rcvd]
            data = [e[1] for e in rcvd]

            #print(loc_tag)

            x = loc_tag[0] // self.image_size[1]
            y = loc_tag[0] % self.image_size[1]
            #x = self.fmap_idx % self.image_size[0]
            #y = self.fmap_idx // self.image_size[0]
            self.fmap_idx = x + y * self.image_size[0]

            #print("{},{} received (output deserializer)".format(x,y))
            #print(data)

            if self.curr_set < out_sets:
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size
                for c in range(cmin, cmax):
                    # should never replace an existing value
                    assert self.ofmap[x, y, c] == 0
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1

            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                self.pass_done.wr(True)
                raise Finish("Done processing")
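To make the index arithmetic in this deserializer easier to follow, here is a minimal, framework-free sketch (plain Python; the helper name deserialize is hypothetical) that scatters the same (loc_tag, value) packets into an ofmap array. It ignores the Reg/Channel machinery and the Finish exception, and assumes each pixel's packets arrive as consecutive blocks of block_size channels.

import numpy as np

def deserialize(packets, image_size, out_chn, block_size):
    # packets: one list of (loc_tag, value) pairs per block of output channels
    ofmap = np.zeros((image_size[0], image_size[1], out_chn))
    out_sets = out_chn // block_size
    curr_set = 0
    for packet in packets:
        loc_tag = [e[0] for e in packet]
        data = [e[1] for e in packet]
        x = loc_tag[0] // image_size[1]  # pixel coordinates encoded in the tag
        y = loc_tag[0] % image_size[1]
        cmin = curr_set * block_size
        ofmap[x, y, cmin:cmin + block_size] = data
        curr_set = (curr_set + 1) % out_sets
    return ofmap

# Example: a 1x2 image, 4 output channels, block_size 2 -> two packets per pixel.
pkts = [[(0, 1.0), (0, 2.0)], [(0, 3.0), (0, 4.0)],
        [(1, 5.0), (1, 6.0)], [(1, 7.0), (1, 8.0)]]
print(deserialize(pkts, (1, 2), 4, 2)[0, 1])  # -> [5. 6. 7. 8.]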
Example #17
class InputSerializer(Module):
    def instantiate(self, arch_input_chn, arr_y, block_size, num_nonzero,
                    pruner_name):
        # PE static configuration (immutable)
        #self.arr_x = arr_x
        self.arr_y = arr_y
        #self.chn_per_word = chn_per_word
        self.block_size = block_size
        self.num_nonzero = num_nonzero

        self.convert_chn = Channel()
        self.prune_chn = Channel()
        self.arch_input_chn = arch_input_chn

        # Although both the InputSerializer and the pruner push to arch_input_chn,
        # there is no conflict: all weights are pushed by the InputSerializer first,
        # then all inputs by the pruner.
        self.converter = Converter(self.convert_chn, self.prune_chn, \
            self.block_size, self.block_size)
        # self.pruner = NaivePruner(self.prune_chn,self.arch_input_chn, \
        #     self.num_nonzero,True)

        # user-defined pruner for this layer; defaults to the naive pruner
        self.pruner = getattr(pruner, pruner_name)(self.prune_chn,self.arch_input_chn, \
            self.num_nonzero, self.block_size, True)

        self.ifmap = None
        self.weights = None
        self.bias = None

        self.image_size = (0, 0)
        self.filter_size = (0, 0)

        self.ifmap_psum_done = True
        self.pass_done = Reg(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.fmap_idx = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def configure(self, ifmap, weights, bias, in_chn, out_chn, image_size,
                  filter_size):
        self.ifmap = ifmap
        self.weights = weights
        self.bias = bias

        self.in_chn = in_chn
        self.out_chn = out_chn

        self.image_size = image_size
        self.filter_size = filter_size

        self.ifmap_psum_done = False
        self.weights_done = False
        self.pass_done.wr(False)

        # State Counters
        self.curr_set = 0
        self.curr_filter = 0
        self.iteration = 0
        self.curr_chn = 0
        self.curr_x = 0  # run through first two dimensions of input
        self.curr_y = 0
        self.bias_set = 0
        #self.send_bias = False

    def tick(self):
        if self.pass_done.rd():
            return

        if self.ifmap_psum_done:
            if self.convert_chn.vacancy():
                data = np.zeros(self.block_size)
                self.convert_chn.push(data)
            return

        in_sets = self.in_chn // self.block_size
        out_sets = self.out_chn // self.block_size
        num_iteration = self.filter_size[0] * self.filter_size[1]

        # read and hold all weights at the beginning for ease of implementation
        if not self.weights_done:
            f_x = self.iteration // self.filter_size[0]
            f_y = self.iteration % self.filter_size[0]

            # Push filters to PE columns. (PE is responsible for pop)
            if self.arch_input_chn.vacancy() and self.iteration < num_iteration:
                cmin = self.curr_filter * self.block_size
                cmax = cmin + self.block_size
                data = np.array([self.weights[f_x, f_y, self.curr_chn, c] \
                        for c in range(cmin, cmax) ])
                #print("{},{},{},{}-{}".format(f_x,f_y,self.curr_chn,cmin,cmax))
                #print(data)
                # push one group of block_size values along the num_filters axis
                self.arch_input_chn.push(data)

                self.curr_filter += 1
                # Loop through blocks of filters
                if (self.curr_filter == out_sets):
                    self.curr_filter = 0
                    self.curr_chn += 1
                if (self.curr_chn == self.in_chn):  # Loop through channels
                    self.curr_chn = 0
                    self.iteration += 1
                # Loop through 2D filter support
                if (self.iteration == num_iteration):
                    self.iteration = 0
                    #print("Weights done")
                    self.weights_done = True

        elif self.arch_input_chn.vacancy() and self.bias_set < out_sets:
            cmin = self.bias_set * self.block_size
            cmax = cmin + self.block_size
            data = np.array([self.bias[c] for c in range(cmin, cmax)])
            #print("bias (input serializer):")
            #print(data)
            self.arch_input_chn.push(data)
            self.bias_set += 1
        elif not self.ifmap_psum_done:
            if self.convert_chn.vacancy():
                cmin = self.curr_set * self.block_size
                cmax = cmin + self.block_size

                #xmin = x
                #xmax = x+self.arr_x
                # Write ifmap to glb
                #data = np.array([ self.ifmap[x, self.curr_y, self.curr_chn] for x in range(xmin, xmax) ])
                data = np.array([
                    self.ifmap[self.curr_x, self.curr_y, c]
                    for c in range(cmin, cmax)
                ])
                #print("{},{},{}-{}".format(self.curr_x, self.curr_y, cmin, cmax))
                #print(data)

                self.curr_set += 1
                if (self.curr_set == in_sets):
                    self.curr_set = 0
                    self.curr_y += 1
                if (self.curr_y == self.image_size[1]):
                    self.curr_y = 0
                    self.curr_x += 1

                self.convert_chn.push(data)

                if (self.curr_x == self.image_size[0]):
                    self.curr_x = 0
                    self.ifmap_psum_done = True
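The serializer above issues one block per tick, gated by channel vacancy, which makes the overall traversal order hard to see at a glance. The sketch below (plain Python; the helper name serialize_order is hypothetical) yields blocks in the order the state counters walk: all weights (filter tap, then input channel, then block of filters), then the bias blocks, then the ifmap (x, then y, then block of input channels). It assumes filter_size matches the first two weight dimensions and ignores the converter/pruner stages the ifmap blocks pass through before reaching arch_input_chn.

import numpy as np

def serialize_order(weights, bias, ifmap, block_size):
    # weights: (fh, fw, in_chn, out_chn), bias: (out_chn,), ifmap: (W, H, in_chn)
    fh, fw, in_chn, out_chn = weights.shape
    W, H, _ = ifmap.shape
    out_sets = out_chn // block_size
    in_sets = in_chn // block_size
    for tap in range(fh * fw):            # 2D filter support
        f_x, f_y = tap // fh, tap % fh
        for chn in range(in_chn):         # input channels
            for fset in range(out_sets):  # blocks of filters
                c0 = fset * block_size
                yield weights[f_x, f_y, chn, c0:c0 + block_size]
    for bset in range(out_sets):          # bias blocks
        c0 = bset * block_size
        yield bias[c0:c0 + block_size]
    for x in range(W):                    # ifmap pixels, channel-blocked
        for y in range(H):
            for iset in range(in_sets):
                c0 = iset * block_size
                yield ifmap[x, y, c0:c0 + block_size]

# Example: 3x3 filters, 4 input channels, 4 output channels, 2x2 image, block_size 2.
w = np.zeros((3, 3, 4, 4)); b = np.zeros(4); ifm = np.zeros((2, 2, 4))
print(sum(1 for _ in serialize_order(w, b, ifm, 2)))  # 9*4*2 + 2 + 2*2*2 = 82

Back-pressure aside, each yielded block should correspond to one push into arch_input_chn or convert_chn in the module above.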
Example #18
class OutputDeserializer(Module):
    def instantiate(self, arch_output_chn, arr_x, arr_y, chn_per_word):
        # PE static configuration (immutable)
        self.arr_x = arr_x
        self.arr_y = arr_y
        self.chn_per_word = chn_per_word

        self.arch_output_chn = arch_output_chn

        self.ofmap = None
        self.reference = None

        self.image_size = (0, 0)

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done = Reg(False)

    def configure(self, ofmap, reference, image_size):
        self.ofmap = ofmap
        self.reference = reference

        self.image_size = image_size

        self.curr_set = 0
        self.fmap_idx = 0

        self.pass_done.wr(False)

    def tick(self):
        if self.pass_done.rd():
            return

        # How many psum packets we expect to receive
        out_sets = self.arr_x // self.chn_per_word
        fmap_per_iteration = self.image_size[0] * self.image_size[1]

        if self.arch_output_chn.valid():
            data = [e for e in self.arch_output_chn.pop()]

            # Calculate the output coordinates where these incoming psums must go
            x = self.fmap_idx % self.image_size[0]
            y = self.fmap_idx // self.image_size[0]

            if self.curr_set < out_sets:
                cmin = self.curr_set * self.chn_per_word
                cmax = cmin + self.chn_per_word
                for c in range(cmin, cmax):
                    self.ofmap[x, y, c] = data[c - cmin]
            self.curr_set += 1

            # After receiving all the elements for pixel 0, move on to pixel 1, etc.
            if self.curr_set == out_sets:
                self.curr_set = 0
                self.fmap_idx += 1
            if self.fmap_idx == fmap_per_iteration:
                self.fmap_idx = 0
                self.pass_done.wr(True)
                if np.all(self.ofmap == self.reference):
                    raise Finish("Success")
                else:
                    print(self.ofmap)
                    print(self.reference)
                    print(self.ofmap - self.reference)
                    raise Finish("Validation Failed")