Пример #1
0
 def get_mem_write_cycles(self, src, size):
     """
     Estimate the cycles taken by a memory write instruction.

     args:
         src: address argument of the write; not used by this estimate
         size: size of data in bits

     returns:
         ceil_a_by_b(size, self.mem_if_width) -- presumably the number of
         interface-width transfers needed for ``size`` bits (confirm
         against the ceil_a_by_b helper)
     """
     interface_width = self.mem_if_width
     return ceil_a_by_b(size, interface_width)
Пример #2
0
 def get_mem_read_cycles(self, dst, size):
     """
     Estimate the cycles taken by a memory read instruction.

     args:
         dst: destination address of the read; not used by this estimate
         size: size of data in bits

     returns:
         ceil_a_by_b(size, self.mem_if_width) -- presumably the number of
         interface-width transfers needed for ``size`` bits (confirm
         against the ceil_a_by_b helper)
     """
     interface_width = self.mem_if_width
     return ceil_a_by_b(size, interface_width)
Пример #3
0
    def get_compute_cycles(self,
                           ic,
                           oc,
                           ow,
                           oh,
                           b,
                           kw,
                           kh,
                           iprec,
                           wprec,
                           im2col=False):
        """
        Estimate the cycles of one compute instruction (a convolution tile).

        args:
            ic: Input Channels
            oc: Output Channels
            ow: Output Width
            oh: Output Height
            b: Batch Size
            kw: Kernel Width
            kh: Kernel Height
            iprec: activation precision, forwarded to get_perf_factor
            wprec: weight precision, forwarded to get_perf_factor
            im2col: boolean. If true, we assume the cpu does im2col, so the
                    whole kw * kh * ic reduction is folded across the N
                    dimension. Otherwise convolutions are done channel-wise
                    and only ic is folded, once per kernel position.

        returns:
            Estimated cycle count for the tile.
        """
        # Fixed per-pass overhead; currently modelled as zero.
        overhead = 0

        # Passes needed to cover the output channels with M columns.
        oc_passes = ceil_a_by_b(oc, self.M)
        # Reduction elements consumed per pass at this precision pair.
        lanes = self.N * self.get_perf_factor(iprec, wprec)

        if im2col:
            reduction = kw * kh * ic
            positions = b * oh * ow
            return positions * oc_passes * \
                (ceil_a_by_b(reduction, lanes) + overhead)

        return b * oc_passes * \
            ow * oh * kw * kh * \
            (ceil_a_by_b(ic, lanes) + overhead)
Пример #4
0
def _optimize_for_order(conv_params, order_type, verbose=False):
    """
    Search power-of-two tile sizes for one fixed loop ordering.

    Args:
        conv_params: tuple ``(acc_obj, K, O, S, IC, OC, B, iprec, wprec,
            im2col, energy_cost)``
        order_type: the loop ordering to evaluate; passed through to
            get_stats_fast and echoed back in the result
        verbose: accepted but unused; get_stats_fast is always called with
            verbose=False

    Returns:
        ``(best_tiling, order_type, best_cycles, best_energy)``. The first
        tiling with the fewest total cycles wins, ties being broken by lower
        energy. All three "best" entries are None when get_stats_fast
        rejects every candidate.
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params

    def _num_steps(extent):
        # Exponents 0 .. ceil(log2(extent)) are tried for this dimension.
        return int(math.ceil(log2(extent))) + 1

    # We do not tile the "K" dimension and compute an entire 2-D conv at a
    # time, so only O, IC, OC and B get candidate tile sizes.
    o_sizes = [min(1 << e, O) for e in range(_num_steps(O))]
    ic_sizes = [min(1 << e, IC) for e in range(_num_steps(IC))]

    # TODO: Fix?
    if im2col:
        oc_sizes = [min(1 << e, OC) for e in range(_num_steps(OC))]
    else:
        # Without im2col, output-channel tiles are multiples of M.
        oc_groups = math.ceil(float(OC) / acc_obj.M)
        oc_sizes = [min((1 << e) * acc_obj.M, OC)
                    for e in range(_num_steps(oc_groups))]

    b_sizes = [min(1 << e, B) for e in range(_num_steps(B))]

    best_tiling = None
    best_cycles = None
    best_energy = None

    for b in b_sizes:
        b_entry = (ceil_a_by_b(B, b), b)

        for ow in o_sizes:
            # Output tiles are square: the height tile mirrors the width.
            o_entry = (ceil_a_by_b(O, ow), ow)

            for ic in ic_sizes:
                ic_entry = (ceil_a_by_b(IC, ic), ic)

                for oc in oc_sizes:
                    tiling = {
                        'B/b': b_entry,
                        'OW/ow': o_entry,
                        'OH/oh': o_entry,
                        'IC/ic': ic_entry,
                        'OC/oc': (ceil_a_by_b(OC, oc), oc),
                    }

                    stats = get_stats_fast(conv_params,
                                           tiling,
                                           order_type,
                                           verbose=False)
                    if stats is None:
                        # Candidate rejected by get_stats_fast.
                        continue

                    cycles = stats.total_cycles
                    energy = stats.get_energy(energy_cost)

                    if best_cycles is None:
                        improves = True
                    elif cycles < best_cycles:
                        improves = True
                    else:
                        improves = (cycles == best_cycles
                                    and energy < best_energy)

                    if improves:
                        best_tiling = tiling
                        best_cycles = cycles
                        best_energy = energy

    return (best_tiling, order_type, best_cycles, best_energy)
Пример #5
0
def get_stats_fast(conv_params, tiling, order_type, verbose=False):
    """
    Estimate cycles and memory accesses to DRAM, IBUF, OBUF, and WBUF for
    one tiling and one loop ordering.

    Args:
        conv_params: tuple ``(acc_obj, K, O, S, IC, OC, B, iprec, wprec,
            im2col, energy_cost)``
        tiling: dict mapping the loop names 'B/b', 'OW/ow', 'OH/oh',
            'IC/ic' and 'OC/oc' to ``(num_tiles, tile_size)``
        order_type: sequence of loop names; it is walked in reverse order
            when promoting buffer traffic across loops
        verbose: print/log intermediate values

    Returns:
        A Stats object with ``reads``, ``writes``, ``total_cycles`` and
        ``mem_stall_cycles`` filled in, or None when a single tile does not
        fit in half of the matching SRAM buffer.

    TODOs: Without im2col, the calculation of weight and act size is inexact
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params

    num_b, b = tiling['B/b']
    num_ow, ow = tiling['OW/ow']
    num_oh, oh = tiling['OH/oh']
    num_ic, ic = tiling['IC/ic']
    num_oc, oc = tiling['OC/oc']

    kw = kh = K
    # Outputs are accounted at a fixed 32 bits each.
    oprec = 32

    perf_factor = acc_obj.get_perf_factor(iprec, wprec)

    writes = {}
    reads = {}

    # Weight tile footprint in bits, with the reduction dimension padded to
    # a multiple of N * perf_factor. The same formula is used with and
    # without im2col.
    # TODO: Figure out the exact non-im2col footprint.
    writes['wgt'] = \
        ceil_a_by_b(K * K * ic, acc_obj.N * perf_factor) * acc_obj.N * perf_factor * \
        oc * \
        wprec

    if im2col:
        writes['act'] = ow * oh * \
            K * K * ic * \
            b * iprec
    else:
        # TODO: Figure this out
        iw = K + (ow - 1) * S
        ih = K + (oh - 1) * S
        writes['act'] = iw * ih * ic * b * iprec

    # Output tile with the channel count padded to a multiple of M. The
    # same amount is both written to and read from the output buffer.
    writes['out'] = ow * oh * ceil_a_by_b(oc,
                                          acc_obj.M) * acc_obj.M * b * oprec
    reads['out'] = ow * oh * ceil_a_by_b(oc, acc_obj.M) * acc_obj.M * b * oprec

    # Skip if overutilizing resources: a tile may use at most half of its
    # buffer (presumably sram sizes are in bytes, hence the * 8).
    # TODO check bytes/bits
    overflow = False
    if writes['wgt'] > acc_obj.sram['wgt'] * 8 / 2:
        if verbose:
            print('wgt overflow: {}'.format(writes['wgt']))
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['act'] > acc_obj.sram['act'] * 8 / 2:
        if verbose:
            print('act overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if writes['out'] > acc_obj.sram['out'] * 8 / 2:
        if verbose:
            print('out overflow')
            print(b, ow, oh, ic, oc)
        overflow = True
    if overflow:
        if verbose:
            print('Activation size: {} bytes'.format(writes['act'] / 8.))
            print('Weights size: {} bytes'.format(writes['wgt'] / 8.))
            print('Output size: {} bytes'.format(writes['out'] / 8.))
        return None

    # Largest transfer per namespace that still fits in the buffer; starts
    # as the single-tile size and grows while promotion succeeds.
    max_write_size = dict(writes)
    max_read_size = dict(reads)

    # First the loop block optimizations
    stats = Stats()
    write_promote = {'wgt': True, 'act': True, 'out': True}
    read_promote = {'out': True}
    if verbose:
        logger.debug('Initialize reads/writes')
        logger.debug('\tim2col: {}'.format(im2col))
        logger.debug('\tTiling: {}'.format(tiling))
        logger.debug('\tReads : {}'.format(reads))
        logger.debug('\tWrites: {}'.format(writes))
    for loop in reversed(order_type):
        num_tiles, tile_size = tiling[loop]

        # promote all writes
        for namespace in writes:
            if write_promote[namespace]:
                # Traffic only grows when this loop indexes the namespace;
                # otherwise the buffered data is reused across iterations.
                if tile_deps[loop][namespace]:
                    writes[namespace] *= num_tiles
                    # Once the data no longer fits in half the SRAM,
                    # promotion stops for every loop after this one.
                    if writes[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        write_promote[namespace] = False
                    else:
                        max_write_size[namespace] = writes[namespace]
            else:
                # Not promoted: the data is re-fetched on every iteration.
                writes[namespace] *= num_tiles

        # promote all reads
        for namespace in reads:
            if read_promote[namespace]:
                if tile_deps[loop][namespace]:
                    reads[namespace] *= num_tiles
                    if reads[namespace] > acc_obj.sram[namespace] * 8. / 2:
                        read_promote[namespace] = False
                    else:
                        # Track the read size itself. This used to copy
                        # writes[namespace], which only gave the right value
                        # because 'out' reads and writes happen to be equal.
                        max_read_size[namespace] = reads[namespace]
            else:
                reads[namespace] *= num_tiles

        if verbose:
            logger.debug('Loop: {}'.format(loop))
            logger.debug('\tLoop range: {}'.format(tiling[loop]))
            logger.debug('\tMax write size: {}'.format(max_write_size))
            logger.debug('\tMax read size: {}'.format(max_read_size))
            logger.debug('\tLoop Dependencies: {}'.format(tile_deps[loop]))
            logger.debug('\tLoop Promote: {}'.format(write_promote))
            logger.debug('\tReads : {}'.format(reads))
            logger.debug('\tWrites: {}'.format(writes))

    # Every bit written into a buffer is read from DRAM, and every bit read
    # back out of a buffer is written to DRAM.
    for namespace in writes:
        stats.writes[namespace] = writes[namespace]
        stats.reads['dram'] += writes[namespace]
    for namespace in reads:
        stats.reads[namespace] = reads[namespace]
        stats.writes['dram'] += reads[namespace]

    # Next the inner loop optimizations
    # The three candidate inner loops and what each one indexes:
    # (os_loop: ic x kh x kw): Wgt: True, Out: False, Act: True
    # (ws_loop: b x oh x ow): Wgt: False, Out: True, Act: True
    # (is_loop: oc): Wgt: True, Out: True, Act: False
    is_loop = ceil_a_by_b(oc, acc_obj.M) * acc_obj.M
    if im2col:
        # With im2col the whole kh * kw * ic reduction is padded together.
        os_loop = ceil_a_by_b(
            ic * kh * kw, acc_obj.N * perf_factor
        ) * acc_obj.N * perf_factor
    else:
        # Channel-wise: only ic is padded, repeated per kernel position.
        os_loop = ceil_a_by_b(
            ic, acc_obj.N * perf_factor
        ) * acc_obj.N * perf_factor * kh * kw
    ws_loop = b * oh * ow

    # Input Stationary energy
    # kw * kh * ic * oh * ow * b -> oc
    is_energy = (os_loop * ws_loop) * (iprec + is_loop * (wprec + oprec))
    # Output Stationary energy
    # oc * oh * ow * b -> kw * kh * ic
    os_energy = (is_loop * ws_loop) * (oprec + os_loop * (iprec + wprec))
    # Weight Stationary energy
    # kw * kh * ic * oc -> b * ow * oh
    ws_energy = (os_loop * is_loop) * (wprec + ws_loop * (iprec + oprec))

    min_energy = min(is_energy, ws_energy, os_energy)
    num_tiles = num_b * num_ow * num_oh * num_ic * num_oc

    # Charge SRAM accesses for the cheapest dataflow. On ties the order of
    # preference is input, then output, then weight stationary.
    if is_energy == min_energy:
        if verbose:
            logger.debug('SRAM access order: Input Stationary')
        stats.reads['act'] += num_tiles * (kw * kh * ic * oh * ow * b) * iprec
        stats.reads['out'] += num_tiles * (kw * kh * ic * oh * ow *
                                           b) * oc * oprec
        stats.writes['out'] += num_tiles * (kw * kh * ic * oh * ow *
                                            b) * oc * oprec
        stats.reads['wgt'] += num_tiles * (kw * kh * ic * oh * ow *
                                           b) * oc * wprec

    elif os_energy == min_energy:
        if verbose:
            logger.debug('SRAM access order: Output Stationary')
        stats.reads['act'] += num_tiles * (oc * oh * ow * b) * (kw * kh *
                                                                ic) * iprec
        stats.reads['out'] += num_tiles * (oc * oh * ow * b) * oprec
        stats.writes['out'] += num_tiles * (oc * oh * ow * b) * oprec
        stats.reads['wgt'] += num_tiles * (oc * oh * ow * b) * (kw * kh *
                                                                ic) * wprec

    else:
        if verbose:
            logger.debug('SRAM access order: Weight Stationary')
        stats.reads['act'] += num_tiles * (kw * kh * ic * oc) * (b * ow *
                                                                 oh) * iprec
        stats.reads['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow *
                                                                 oh) * oprec
        stats.writes['out'] += num_tiles * (kw * kh * ic * oc) * (b * ow *
                                                                  oh) * oprec
        stats.reads['wgt'] += num_tiles * (kw * kh * ic * oc) * wprec

    # TODO: update
    # The first fill and the last drain are counted as pure latency;
    # only the DRAM traffic in between is compared against compute time.
    initial_dram_reads = 0
    final_dram_writes = 0
    for namespace in max_write_size:
        initial_dram_reads += max_write_size[namespace]
    for namespace in max_read_size:
        final_dram_writes += max_read_size[namespace]
    latency = acc_obj.get_mem_read_cycles('dram', initial_dram_reads) + \
        acc_obj.get_mem_write_cycles('dram', final_dram_writes)

    total_dram_accesses = stats.reads['dram'] + stats.writes['dram']
    middle_dram_accesses = total_dram_accesses - initial_dram_reads - final_dram_writes

    compute_cycles = num_tiles * acc_obj.get_compute_cycles(
        ic, oc, ow, oh, b, kw, kh, iprec, wprec, im2col)
    memory_cycles_required = ceil_a_by_b(middle_dram_accesses,
                                         acc_obj.mem_if_width)

    # Stall only for the memory time that exceeds compute time, plus the
    # fill/drain latency.
    memory_stalls = max(0, memory_cycles_required - compute_cycles) + latency
    stats.total_cycles = compute_cycles + memory_stalls
    stats.mem_stall_cycles = memory_stalls

    if verbose:
        logger.debug('Compute cycles : {:>20,}'.format(compute_cycles))
        logger.debug('Memory cycles  : {:>20,}'.format(memory_cycles_required +
                                                       latency))
        logger.debug('Memory stalls  : {:>20,}'.format(memory_stalls))

    return stats
Пример #6
0
def get_loop_instructions(conv_params, tiling, order_type):
    """
    Build the loop / memory / compute instruction stack for one tiling.

    Args:
        conv_params: tuple ``(acc_obj, K, O, S, IC, OC, B, iprec, wprec,
            im2col, energy_cost)``
        tiling: dict mapping the loop names 'B/b', 'OW/ow', 'OH/oh',
            'IC/ic' and 'OC/oc' to ``(num_tiles, tile_size)``
        order_type: non-empty sequence of loop names giving the loop order;
            loops are inserted at increasing ``level`` in this order

    Returns:
        The populated LoopStack after ``promote_mem_ops`` has been applied,
        or None when a single tile exceeds half of the weight, activation
        or output SRAM.
    """
    acc_obj, K, O, S, IC, OC, B, iprec, wprec, im2col, energy_cost = conv_params
    I = (O - 1) * S + K

    num_b, b = tiling['B/b']
    num_ow, ow = tiling['OW/ow']
    num_oh, oh = tiling['OH/oh']
    num_ic, ic = tiling['IC/ic']
    num_oc, oc = tiling['OC/oc']

    # Per loop: [iteration count, act stride, wgt stride, out stride].
    instructions = {}
    instructions['B/b'] = [num_b, I * I * IC * b, 0, O * O * OC * b]
    instructions['OW/ow'] = [num_ow, ow * S, 0, ow]
    instructions['OH/oh'] = [num_oh, I * S, 0, O]
    instructions['IC/ic'] = [num_ic, I * I * ic, K * K * ic, 0]
    instructions['OC/oc'] = [num_oc, 0, K * K * IC * oc, O * O * oc]

    instruction_ordered = LoopStack()
    wgt_stride = []
    act_stride = []
    out_stride = []

    def _insert_loop(loop_name, level):
        # Add one loop to the stack and record its per-namespace strides.
        iterations, act_step, wgt_step, out_step = instructions[loop_name]
        stride = {'wgt': wgt_step, 'act': act_step, 'out': out_step}
        instruction_ordered.insert_loop(iterations,
                                        stride=stride,
                                        level=level,
                                        name=loop_name)
        wgt_stride.append(stride['wgt'])
        act_stride.append(stride['act'])
        out_stride.append(stride['out'])

    # Only loops that actually iterate (more than one tile) are emitted.
    count = 0
    for loop_name in order_type:
        if instructions[loop_name][0] > 1:
            _insert_loop(loop_name, count)
            count += 1
    if count == 0:
        # No loop iterates: still emit the last loop of the ordering so the
        # memory and compute instructions have an enclosing level. This
        # used to rely on the loop variable leaking out of the for-loop.
        _insert_loop(order_type[-1], count)
        count += 1

    iw = K + (ow - 1) * S
    ih = K + (oh - 1) * S

    # Per-tile transfer sizes in bits, padded to the array dimensions.
    if im2col:
        wgt_read_size = \
            ceil_a_by_b(K * K * ic, acc_obj.N) * acc_obj.N * oc * \
            wprec
        act_read_size = ow * oh * \
            ceil_a_by_b(K * K, acc_obj.N) * \
            b * iprec * acc_obj.N
    else:
        wgt_read_size = \
            ceil_a_by_b(K * K * ic, acc_obj.N) * acc_obj.N * \
            ceil_a_by_b(oc, acc_obj.M) * acc_obj.M * \
            wprec
        act_read_size = iw * ih * ic * b * iprec

    # Outputs are accounted at a fixed 32 bits each.
    oprec = 32
    out_read_size = ow * oh * oc * b * oprec

    # Skip if overutilizing resources (consider double buffering)
    if wgt_read_size > acc_obj.sram['wgt'] * 8 / 2.0:
        print('error')
        return None
    if act_read_size > acc_obj.sram['act'] * 8 / 2.0:
        return None
    if out_read_size > acc_obj.sram['out'] * 8 / 2.0:
        return None

    # Memory Instructions, all placed at the level below the last loop.
    instruction_ordered.insert_mem_read(name='Wgt RD',
                                        namespace='wgt',
                                        addr=0,
                                        size=wgt_read_size,
                                        stride=wgt_stride,
                                        level=count)
    instruction_ordered.insert_mem_read(name='Act RD',
                                        namespace='act',
                                        addr=0,
                                        size=act_read_size,
                                        stride=act_stride,
                                        level=count)
    instruction_ordered.insert_mem_read(name='Out RD',
                                        namespace='out',
                                        addr=0,
                                        size=out_read_size,
                                        stride=out_stride,
                                        level=count)
    instruction_ordered.insert_mem_write(name='Out WR',
                                         namespace='out',
                                         addr=0,
                                         size=out_read_size,
                                         stride=out_stride,
                                         level=count)

    instruction_ordered.insert_compute(acc_obj.get_compute_stats, ic, oc, ow,
                                       oh, b, K, K, iprec, wprec, im2col)

    instruction_ordered.promote_mem_ops(acc_obj.sram)

    return instruction_ordered
Пример #7
0
    def get_conv_cycles(self,
                        K,
                        O,
                        S,
                        IC,
                        OC,
                        iprec,
                        wprec,
                        batch_size=1,
                        im2col=False):
        """
        Get number of cycles required for Convolution layer.

        description:
            Delegates the tiling/ordering search to optimize_for_order,
            re-evaluates the winning configuration with get_stats_fast and
            logs a summary of the layer and its throughput.

        returns:
            (stats, best_instructions): the stats of the best configuration
            and the instructions returned by optimize_for_order.
        """
        log = self.logger.debug
        accelerator = self.accelerator

        B = batch_size
        I = (O - 1) * S + K

        def _num_steps(extent):
            # Power-of-two tile candidates for a dimension of this extent.
            return int(math.ceil(log2(extent))) + 1

        # We do not tile the "K" dimension and compute an entire 2-D conv at
        # a time. These counts are only reported here; the search itself
        # happens inside optimize_for_order.
        num_O_tiles = _num_steps(O)
        num_IC_tiles = _num_steps(IC)
        num_OC_tiles = _num_steps(math.ceil(float(OC) / accelerator.M))
        num_B_tiles = _num_steps(B)

        log('Number of O Tiles: {}'.format(num_O_tiles))
        log('Number of IC Tiles: {}'.format(num_IC_tiles))
        log('Number of OC Tiles: {}'.format(num_OC_tiles))
        log('Number of B Tiles: {}'.format(num_B_tiles))

        conv_params = (accelerator, K, O, S, IC, OC, B, iprec, wprec, im2col,
                       self.get_energy_cost())

        best_instructions, best_tiling, best_order = optimize_for_order(
            conv_params)
        stats = get_stats_fast(conv_params,
                               best_tiling,
                               best_order,
                               verbose=False)
        best_cycles = stats.total_cycles

        # One op per multiply-accumulate of the full layer.
        num_ops = O * O * K * K * IC * OC * B

        log('Conv Layer')
        log('Num of ops: {}'.format(num_ops))
        log('Kernel Size: {}x{}x{}x{}'.format(K, K, IC, OC))
        log('Output Size: {}x{}x{}'.format(O, O, OC))
        log('Stride Size: {}x{}'.format(S, S))
        log('Input  Size: {}x{}x{}'.format(I, I, IC))

        log('Max Precision: {}'.format(accelerator.pmax))
        log('Min Precision: {}'.format(accelerator.pmin))

        log('Activation Precision: {}'.format(iprec))
        log('Weight Precision: {}'.format(wprec))
        log('Performance Factor: {}'.format(
            self.get_perf_factor(iprec, wprec)))

        log('Total Cycles: {:,}'.format(best_cycles))
        cycles_per_batch = ceil_a_by_b(best_cycles, B)
        log('Total Cycles per batch: {:,}'.format(cycles_per_batch))

        ops_per_cycle = float(num_ops) / best_cycles
        log('Ops/Cycle: {:,.2f}'.format(ops_per_cycle))

        pe_count = accelerator.N * accelerator.M
        ops_per_cycle_per_pe = float(ops_per_cycle) / pe_count
        log('Ops/Cycle/PE: {:,.4}'.format(ops_per_cycle_per_pe))

        return stats, best_instructions
Пример #8
0
    def get_energy_cost(self):
        """
        Build the EnergyTuple of per-access energies for this accelerator.

        If ``self.energy_costs`` is already set it is returned unchanged.
        Otherwise the three on-chip buffers (WBUF, IBUF, OBUF) are looked up
        through ``self.sram_obj.get_data_clean`` and the systolic-array core
        is looked up in ``./results/systolic_array_synth.csv``. When no
        synthesis row matches the NxM array, the 4x4 row is scaled by
        ``N * M / 16``.

        returns:
            EnergyTuple(core dynamic energy, WBUF read, WBUF write,
            IBUF read, IBUF write, OBUF read, OBUF write); buffer energies
            are per bit.
        """
        if self.energy_costs is not None:
            return self.energy_costs

        debug = self.logger.debug
        accelerator = self.accelerator

        N = accelerator.N
        M = accelerator.M
        pmax = accelerator.pmax
        pmin = accelerator.pmin

        ##################################################
        # Buffer geometry. Sizes are in bits (sram entries * 8).
        wbuf_size = accelerator.sram['wgt'] * 8
        ibuf_size = accelerator.sram['act'] * 8
        obuf_size = accelerator.sram['out'] * 8

        wbuf_bank = N * M
        ibuf_bank = N
        obuf_bank = M

        wbuf_bits = (pmax * pmax / pmin)
        ibuf_bits = (pmax * pmax / pmin)
        obuf_bits = 32

        wbuf_word = ceil_a_by_b(wbuf_size, wbuf_bank * wbuf_bits)
        ibuf_word = ceil_a_by_b(ibuf_size, ibuf_bank * ibuf_bits)
        obuf_word = ceil_a_by_b(obuf_size, obuf_bank * obuf_bits)

        wbuf_bank_size = wbuf_word * wbuf_bits
        ibuf_bank_size = ibuf_word * ibuf_bits
        obuf_bank_size = obuf_word * obuf_bits

        # Each buffer must split evenly into its banks.
        assert wbuf_bank_size * wbuf_bank == wbuf_size
        assert ibuf_bank_size * ibuf_bank == ibuf_size
        assert obuf_bank_size * obuf_bank == obuf_size

        def _buffer_energy(header, total_size, banks, bits, words, bank_size,
                           rw_port):
            # Query the SRAM model for one bank, log the buffer summary and
            # return its (read, write) energy per bit.
            cfg_dict = {
                'size (bytes)': bank_size / 8.,
                'block size (bytes)': bits / 8.,
                'read-write port': rw_port
            }
            data = self.sram_obj.get_data_clean(cfg_dict)
            read_energy = float(data['read_energy_nJ']) / bits
            write_energy = float(data['write_energy_nJ']) / bits
            leak_power = float(data['leak_power_mW']) * banks
            area = float(data['area_mm^2']) * banks

            debug(header)
            debug('\tBanks                       : {0:>8}'.format(banks))
            debug('\tBitWidth                    : {0:>8} bits'.format(bits))
            debug('\tWords                       : {0:>8}'.format(words))
            debug('\tTotal Size                  : {0:>8} kBytes'.format(
                total_size / 8. / 1024.))
            debug('\tTotal Area                  : {0:>8.2f} mm^2'.format(
                area))
            debug('\tLeak Energy (per clock)     : {0:>8.4f} mWatt'.format(
                leak_power))
            debug('\tRead Energy                 : {0:>8.4f} pJ/bit'.format(
                read_energy * 1.e3))
            debug('\tWrite Energy                : {0:>8.4f} pJ/bit'.format(
                write_energy * 1.e3))
            return read_energy, write_energy

        ##################################################
        wbuf_read_energy, wbuf_write_energy = _buffer_energy(
            'WBUF :', wbuf_size, wbuf_bank, wbuf_bits, wbuf_word,
            wbuf_bank_size, 0)
        ibuf_read_energy, ibuf_write_energy = _buffer_energy(
            'IBUF :', ibuf_size, ibuf_bank, ibuf_bits, ibuf_word,
            ibuf_bank_size, 0)
        # The output buffer is the only one queried with a read-write port.
        obuf_read_energy, obuf_write_energy = _buffer_energy(
            'OBUF :', obuf_size, obuf_bank, obuf_bits, obuf_word,
            obuf_bank_size, 1)

        ##################################################
        # Get stats for systolic array
        core_csv = os.path.join('./results', 'systolic_array_synth.csv')
        core_synth_data = pandas.read_csv(core_csv)

        lookup_dict = {
            'Max Precision (bits)': pmax,
            'Min Precision (bits)': pmin,
            'N': N,
            'M': M,
        }
        core_data = lookup_pandas_dataframe(core_synth_data, lookup_dict)
        if len(core_data) == 0:
            # No synthesis result for this array size: scale the 4x4 one.
            lookup_dict['N'] = 4
            lookup_dict['M'] = 4
            core_data = lookup_pandas_dataframe(core_synth_data, lookup_dict)
            assert len(core_data) == 1
            core_area = float(core_data['Area (um^2)']) * 1.e-6 * (N * M) / 16.
            core_dyn_power = float(
                core_data['Dynamic Power (nW)']) * (N * M) / 16.
            core_leak_power = float(
                core_data['Leakage Power (nW)']) * (N * M) / 16.
        else:
            core_area = float(core_data['Area (um^2)']) * 1.e-6
            core_dyn_power = float(core_data['Dynamic Power (nW)'])
            core_leak_power = float(core_data['Leakage Power (nW)'])
        # Power divided by the synthesis frequency gives energy per cycle.
        core_dyn_energy = core_dyn_power / float(core_data['Frequency'])
        core_leak_energy = core_leak_power / float(core_data['Frequency'])

        debug('Core :')
        debug('\tDimensions              : {0}x{1}-systolic array'.format(
            N, M))
        debug('\tMax-Precision           : {}'.format(pmax))
        debug('\tMin-Precision           : {}'.format(pmin))
        debug('\tLeak power              : {} (nW)'.format(core_leak_energy))
        debug('\tDynamic Energy (nJ)     : {}'.format(core_dyn_energy))
        debug('\tArea (mm^2)             : {}'.format(core_area))
        ##################################################

        return EnergyTuple(core_dyn_energy, wbuf_read_energy,
                           wbuf_write_energy, ibuf_read_energy,
                           ibuf_write_energy, obuf_read_energy,
                           obuf_write_energy)