def __init__(self, analysis_case: AnalysisCase, arch_case: Design):
    """Set up the evaluator and cache values derived from the analysis case.

    Args:
        analysis_case: memory/loop configuration under evaluation; forwarded
            to the parent constructor.
        arch_case: hardware design point; forwarded to the parent constructor.

    After the parent constructor runs, this reads ``self.memcase`` and
    ``self.computecase`` (presumably populated by the parent - confirm) and
    stores:

    * ``total_memory_access`` - result of ``get_total_memory_access()``.
    * ``OL2_size`` - basic output tile area (W * H) times lane, core and
      chiplet counts, wrapped in ``size.B``.
    * ``real_WL1`` - ``WL1.to_b() / 8`` divided by ``Wc * Hc``.
    """
    super(OverheadEval, self).__init__(analysis_case, arch_case)
    self.total_memory_access = self.get_total_memory_access()

    basic_tile = cfg.OL1_choices_map[self.memcase.OL1]
    compute = self.computecase
    self.OL2_size: size.Size = size.B(
        (basic_tile.W * basic_tile.H) * compute.lane * compute.core * compute.chiplet)

    # get_analysis_cases() stores WL1 already multiplied by Wc * Hc, so the
    # same factor is divided back out here.
    # NOTE(review): the "/ 8" looks like a bits -> bytes conversion of to_b();
    # confirm against the size module.
    core_grid = self.memcase.loopParameter.chiplet_spatial_parameter
    self.real_WL1 = size.B(
        self.memcase.WL1.to_b() / 8 / (core_grid.Wc * core_grid.Hc))
def get_mem_footprint(self):
    """Return the on-chip buffer footprint of the whole package.

    Adds the AL1, WL1, AL2, OL1 and OL2 contributions of one chiplet (each
    taken through ``to_b()``), multiplies by the chiplet count and divides by
    8192.

    NOTE(review): 8192 == 8 * 1024, which presumably converts bits to KiB -
    confirm the unit returned by ``to_b()``.
    """
    lanes = self.computecase.lane
    cores = self.computecase.core
    core_grid = self.memcase.loopParameter.chiplet_spatial_parameter

    # Activation L1 scales with cores only.
    act_l1 = self.memcase.AL1.to_b() * cores
    # The stored WL1 is divided by Wc * Hc before being scaled per lane and core.
    wgt_l1 = self.memcase.WL1.to_b() / (core_grid.Wc * core_grid.Hc) * lanes * cores
    # AL2 is counted once per chiplet.
    act_l2 = self.memcase.AL2.to_b()
    # Output buffers use fixed sizes here (192 B and 8 * 8 B) per lane and core.
    out_l1 = size.B(192).to_b() * lanes * cores
    out_l2 = size.B(8 * 8).to_b() * lanes * cores

    per_chiplet = act_l1 + wgt_l1 + act_l2 + out_l1 + out_l2
    return self.computecase.chiplet * per_chiplet / 8192
def get_chiplet_communication(self):
    """Return the chiplet-to-chiplet traffic as a ``size.Size``.

    Weight traffic is always zero (the original comment states that MUSE-V3
    does not support weight rotation reuse). Activation traffic is non-zero
    only when ``rotation_enable`` is set on the loop parameters; it is then
    ``2 * Kp * (Kp - 1)`` times the input footprint of the basic output tile
    times the temporal and spatial loop counts.
    """
    loop = self.memcase.loopParameter
    C0, C1, W1, W2, H1, H2, K1, K2 = loop.get_temporal_count()
    Kp, Hp, _, Wc, Hc = loop.get_spatial_count()
    basic_tile = cfg.OL1_choices_map[self.memcase.OL1]

    # MUSE-V3 does not support weight rotation reuse.
    weight_traffic: size.Size = size.B(0)

    if not loop.rotation_enable:
        activation_traffic: size.Size = size.B(0)
    else:
        tile_in = basic_tile.in_tile(self.memcase.workload.kernel_size,
                                     self.memcase.workload.stride)
        activation_traffic = size.B(
            2 * Kp * (Kp - 1) * tile_in.size()
            * C0 * C1 * K2 * K1 * Hp * W1 * H1 * W2 * H2 * Wc * Hc)

    return weight_traffic + activation_traffic
def _largest_plane_split(AL2, basic_tile, Wc, Hc, C0, workload,
                         chiplet_workload_in, W1H1_choices_map):
    """Return the largest mini-tile count from (2, 4, 8, 16) that still fits.

    Candidates are tried in increasing order. The scan stops at the first one
    whose input tile exceeds the chiplet workload or does not fit in AL2; the
    last accepted candidate (1 when none is accepted) is returned.
    """
    chosen = 1
    # TODO: To support more cases for the total number of mini-tiles (HW can do it)
    for candidate in (2, 4, 8, 16):
        split = W1H1_choices_map[candidate]
        sub_tile_in = TileSize(basic_tile.H * split.H,
                               basic_tile.W * split.W).in_tile(
                                   workload.kernel_size, workload.stride)
        tile_in = TileSize(Hc * sub_tile_in.H,
                           Wc * sub_tile_in.W).in_tile(
                               workload.kernel_size, workload.stride)
        # Stop once the tile is larger than the chiplet workload.
        if tile_in.W > chiplet_workload_in.W or tile_in.H > chiplet_workload_in.H:
            break
        # The tile must fit in AL2; input channels are aligned to the vector size.
        if AL2 >= size.B(
                tile_in.W * tile_in.H * ceil(workload.in_channel / C0) * C0):
            chosen = candidate
            continue
        break
    return chosen


def _largest_channel_split(WL1, Kp, Kc, K0, Fw, Fh, C0, workload,
                           chiplet_workload_K):
    """Return the largest K1 from (1, 2, 4, 8, 16) that still fits in WL1.

    The scan stops at the first candidate for which ``Kp * Kc * K0 * K1``
    reaches the chiplet output-channel workload, or whose weights no longer
    fit in WL1. Returns 1 when no candidate is accepted.
    """
    chosen = 1
    # TODO: the factor could be any integer (e.g. 3), but the margin case
    # would then need handling.
    for candidate in (1, 2, 4, 8, 16):
        if Kp * Kc * K0 * candidate >= chiplet_workload_K:
            break
        # Weights must fit in WL1; input channels are aligned to the vector size.
        if WL1 >= size.B(
                Fw * Fh * ceil(workload.in_channel / C0) * C0 * candidate):
            chosen = candidate
        else:
            break
    return chosen


def get_analysis_cases(design: Design, workload: Workload) -> List[AnalysisCase]:
    """Enumerate every AnalysisCase to evaluate for one design and workload.

    The search nests, in this order: package-level spatial division, WL1
    choice, chiplet-level spatial division, basic output tile (OL1), AL2
    choice, rotation on/off, AL1 choice and reorder type (1 and 2). For each
    (OL1, AL2) pair the level-1 temporal factors W1/H1 and K1 are picked by
    the two helpers above, and the level-2 counts W2/H2/K2 follow by ceiling
    division.

    Args:
        design: hardware design point (chiplet, core, lane, vector counts).
        workload: layer description (channels, output size, kernel, stride).

    Returns:
        The generated cases, in nesting order.
    """
    # Imported at call time. NOTE(review): presumably so that values rebound
    # by refresh_conf() are picked up - confirm refresh_conf lives in config.
    from config import ACT_MEMORY_ALIGN, rotation_search_list
    from config import AL1_choices, WL1_choices, OL1_choices_map, AL2_choices, W1H1_choices_map

    result: List[AnalysisCase] = []
    # Pick the package-spatial division cases that match the chiplet count.
    package_divisions = get_package_spatial_parameters()[design.chiplet]
    K0 = design.lane

    for package_spatial_parameter in package_divisions:
        Kp = package_spatial_parameter.Kp
        Hp = package_spatial_parameter.Hp
        Fw = workload.kernel_size.W
        Fh = workload.kernel_size.H
        C0 = design.vector
        # W is divided by 1 here, i.e. not split at package level.
        chiplet_workload_W: int = ceil(workload.out_size.W / 1)
        chiplet_workload_H: int = ceil(workload.out_size.H / Hp)
        chiplet_workload_K: int = ceil(workload.out_channel / Kp)

        # This loop has no meaning in the post-design flow (i.e. for MUSE-V3).
        for base_WL1 in WL1_choices:
            # Chiplet-level spatial division: Kc, Hc, Wc.
            chiplet_divisions = get_chiplet_spatial_parameters()[design.core]

            for chiplet_spatial_parameter in chiplet_divisions:
                Kc = chiplet_spatial_parameter.Kc
                Wc = chiplet_spatial_parameter.Wc
                Hc = chiplet_spatial_parameter.Hc
                # In MUSE-V3, cores that share weight data can fuse their
                # local WL1 into a larger buffer; Wc * Hc is the number of
                # sharing cores. The base value is kept separately so it is
                # not overwritten across iterations.
                WL1 = base_WL1 * Wc * Hc

                # Each entry is one basic output-tile size.
                for OL1, BasicSize in OL1_choices_map.items():
                    # This loop has no meaning in the post-design flow.
                    for AL2 in AL2_choices:
                        chiplet_workload_in = TileSize(
                            chiplet_workload_W, chiplet_workload_H).in_tile(
                                workload.kernel_size, workload.stride)

                        # Level-1 temporal factors in the plane dimension.
                        plane_split = W1H1_choices_map[_largest_plane_split(
                            AL2, BasicSize, Wc, Hc, C0, workload,
                            chiplet_workload_in, W1H1_choices_map)]
                        W1 = plane_split.W
                        H1 = plane_split.H

                        # Level-1 temporal factor in the channel dimension.
                        K1 = _largest_channel_split(
                            WL1, Kp, Kc, K0, Fw, Fh, C0, workload,
                            chiplet_workload_K)

                        # Loop parameters for this tile shape.
                        tile_W = BasicSize.W * W1 * Wc
                        tile_H = BasicSize.H * H1 * Hc
                        tile_K = K0 * K1 * Kc
                        W2: int = ceil(chiplet_workload_W / tile_W)
                        H2: int = ceil(chiplet_workload_H / tile_H)
                        K2: int = ceil(chiplet_workload_K / tile_K)

                        # Rotation and non-rotation variants.
                        for rotation_enable in rotation_search_list:
                            if rotation_enable:
                                # Rotation is skipped unless the input channels
                                # are a multiple of ACT_MEMORY_ALIGN * chiplets.
                                if workload.in_channel % (ACT_MEMORY_ALIGN * design.chiplet) != 0:
                                    continue
                                aligned_CI = ceil(workload.in_channel / (C0 * Kp)) * C0 * Kp
                            else:
                                aligned_CI = ceil(workload.in_channel / C0) * C0

                            aligned_workload = Workload(
                                aligned_CI, tile_K * K2 * Kp,
                                Block(tile_H * H2, tile_W * W2),
                                workload.kernel_size, workload.stride)
                            loopParameter = LoopParameter(
                                W1, H1, K1, W2, H2, K2,
                                package_spatial_parameter,
                                chiplet_spatial_parameter,
                                aligned_workload, design, rotation_enable)

                            # This loop has no meaning in the post-design flow.
                            for AL1 in AL1_choices:
                                if AL1 > AL2:
                                    continue
                                # One case per reordering type.
                                reorder_cases = [ReorderCase(type_n) for type_n in (1, 2)]
                                result.extend(
                                    AnalysisCase(OL1, AL1, WL1, AL2, reorderCase,
                                                 loopParameter, aligned_workload)
                                    for reorderCase in reorder_cases)
    return result
# Alternative parallelism settings, kept for reference.
# 8b-A, 4b-W Mode Configurations:
# num_chiplets: List[int] = [4]
# num_cores: List[int] = [8]
# num_lanes: List[int] = [16]
# size_vectors: List[int] = [8]

# 4b-A, 4b-W Mode Configurations:
# num_chiplets: List[int] = [4]
# num_cores: List[int] = [8]
# num_lanes: List[int] = [16]
# size_vectors: List[int] = [16]

# 8b-A, 8b-W Mode Configurations (the values active at import time):
AL2_choices: List[size.Size] = [size.B(46080)]
AL1_choices: List[size.Size] = [size.B(8192)]
WL1_choices: List[size.Size] = [size.B(1168)]
# TODO: MUSE-V3 can support more basic sizes
# Maps the OL1 capacity (3 bytes per output position of an edge x edge tile:
# 3, 12, 48, 192) to the matching basic output tile.
OL1_choices_map: Dict[size.Size, TileSize] = {
    size.B(3 * edge * edge): TileSize(edge, edge)
    for edge in (1, 2, 4, 8)
}

# 16b-A, 8b-W Mode Configurations:
# AL2_choices: List[size.Size] = [size.B(23040)]
# AL1_choices: List[size.Size] = [size.B(4096)]
# WL1_choices: List[size.Size] = [size.B(1168)]
# TODO: MUSE-V3 can support more basic sizes
def get_energy(self):
    """Compute the total energy and its per-component breakdown.

    Every component multiplies an access count (``to_b()`` of a
    ``total_memory_access`` field) by a bit-width ratio from ``cfg`` and a
    per-access energy constant from ``cfg``.

    Returns:
        A 4-tuple ``(total_energy, Energy_DRAMtoSRAM_W, Energy_DRAMtoSRAM_A,
        energy_breakdown)`` where ``energy_breakdown`` is an
        ``EnergyBreakdown`` of the DRAM, die-to-die, AL2, AL1, WL1, output
        and MAC components.
    """
    runtime = self.get_runtime()
    d2d_traffic = self.get_chiplet_communication()

    access = self.total_memory_access
    compute = self.computecase
    act_ratio = cfg.A_BW_RATIO
    wgt_ratio = cfg.W_BW_RATIO

    al2_wr = access.AL2_Wr.to_b()
    al1_wr = access.AL1_Wr.to_b()
    al1_rd = access.AL1_Rd.to_b()
    wl1_wr = access.WL1_Wr.to_b()
    wl1_rd = access.WL1_Rd.to_b()
    ol2_wr = access.OL2_Wr.to_b()
    ol2_rd = access.OL2_Rd.to_b()
    ol1_wr = access.OL1_Wr.to_b()
    ol1_rd = access.OL1_Rd.to_b()

    # DRAM -> on-chip SRAM transfers: link (GRS) + DRAM + SRAM write.
    Energy_DRAMtoSRAM_A: float = act_ratio * al2_wr * (
        cfg.Energy_GRS + cfg.Energy_DRAM + cfg.Energy_AL2_Wr)
    Energy_DRAMtoSRAM_W: float = wgt_ratio * wl1_wr * (
        cfg.Energy_GRS + cfg.Energy_DRAM + cfg.Energy_WL1_Wr)

    # MAC energy scales with runtime, data width and the total MAC count
    # (lane * vector * chiplet * core).
    Energy_TotalMAC: float = runtime * cfg.Energy_MAC * cfg.DATA_WIDTH * (
        compute.lane * compute.vector * compute.chiplet * compute.core)

    # Energy breakdown. The DRAM component also includes the GRS link energy
    # of the same three transfers.
    DRAM_energy: float = (
        act_ratio * ol2_rd * cfg.Energy_DRAM
        + wgt_ratio * wl1_wr * cfg.Energy_DRAM
        + act_ratio * al2_wr * cfg.Energy_DRAM
        + act_ratio * al2_wr * cfg.Energy_GRS
        + wgt_ratio * wl1_wr * cfg.Energy_GRS
        + act_ratio * ol2_rd * cfg.Energy_GRS)
    D2D_energy: float = act_ratio * d2d_traffic.to_b() * cfg.Energy_GRS
    # NOTE(review): the AL2 read term is driven by the AL1_Rd count, not
    # AL2_Rd - confirm this is the intended model.
    A_L2_energy: float = (
        act_ratio * al2_wr * cfg.Energy_AL2_Wr
        + act_ratio * al1_rd * cfg.Energy_AL2_Rd)
    A_L1_energy: float = (
        act_ratio * al1_wr * cfg.Energy_AL1_Wr
        + act_ratio * al1_rd * cfg.Energy_AL1_Rd)
    # NOTE(review): the WL1 read term uses A_BW_RATIO while the WL1 write
    # term uses W_BW_RATIO - confirm this is intended.
    W_L1_energy: float = (
        wgt_ratio * wl1_wr * cfg.Energy_WL1_Wr
        + act_ratio * wl1_rd * cfg.Energy_WL1_Rd)
    # OL1 is costed as a register file of size.B(384), with a factor of 4.
    rf_energy = cfg.get_Energy_RF(size.B(384))
    output_energy: float = (
        act_ratio * ol2_rd * cfg.Energy_OL2_Rd
        + act_ratio * ol2_wr * cfg.Energy_OL2_Wr
        + 4 * ol1_wr * rf_energy
        + 4 * ol1_rd * rf_energy)
    MAC_energy: float = Energy_TotalMAC

    total_energy = (DRAM_energy + D2D_energy + A_L2_energy + A_L1_energy
                    + W_L1_energy + output_energy + MAC_energy)
    energy_breakdown = EnergyBreakdown(DRAM_energy, D2D_energy, A_L2_energy,
                                       A_L1_energy, W_L1_energy, output_energy,
                                       Energy_TotalMAC)
    return total_energy, Energy_DRAMtoSRAM_W, Energy_DRAMtoSRAM_A, energy_breakdown
def get_dram_communication(self):
    """Return the total DRAM traffic as a ``size.Size``.

    The traffic is the sum of the OL2 reads, WL1 writes and AL2 writes
    recorded in ``self.total_memory_access``.

    The sizes are added directly instead of summing their ``to_b()`` values
    and wrapping the sum in ``size.B(...)``. Elsewhere in this file
    ``to_b()`` is divided by 8 before being passed to ``size.B``, so the
    previous round trip mixed units. Direct addition gives the same result
    if the units already agreed.
    NOTE(review): confirm the units of ``to_b()`` / ``size.B`` in the size
    module.
    """
    access = self.total_memory_access
    return access.OL2_Rd + access.WL1_Wr + access.AL2_Wr
def search(workload: Workload, writer: csv.DictWriter, note: str):
    """Evaluate every analysis case of every design and write one CSV row each.

    Args:
        workload: layer description passed to ``get_analysis_cases``.
        writer: destination for the result rows; one ``writerow`` call is
            made per analysis case.
        note: free-form text copied into the ``note`` column of every row.

    Iterates the module-level ``designs`` with a tqdm progress bar.
    """
    # Developer switch: when truthy, print the per-module energy constants
    # for the first case and exit the process.
    view_module_energy = 0

    for design in tqdm(designs, desc='Try Different Hardware Parallel Designs'):
        sram_cases: List[AnalysisCase] = get_analysis_cases(design, workload)
        for sram_case in sram_cases:
            evaluator = OverheadEval(sram_case, design)
            (total_memory, total_memory_access, chiplet_communication,
             total_runtime, area_per_chiplet, area_per_package,
             total_energy, Energy_DRAMtoSRAM_W, Energy_DRAMtoSRAM_A,
             energy_breakdown) = evaluator.evaluation()
            dram_access = evaluator.get_dram_communication()

            if view_module_energy:
                print("\n")
                print('Chip to Chip: %f' % Energy_GRS)
                print('DRAM to SRAM_A: %f' % (Energy_GRS + Energy_DRAM + Energy_AL2_Wr))
                print('DRAM to SRAM_W: %f' % (Energy_GRS + Energy_DRAM + Energy_WL1_Wr))
                print('AL2 to AL1: %f' % (Energy_AL2_Rd + Energy_AL1_Wr))
                # Fixed: these two lines previously printed each other's
                # constant (WL1 read under "AL1", AL1 read under "WL1").
                print('AL1 to MAC: %f' % (Energy_AL1_Rd))
                print('WL1 to MAC: %f' % (Energy_WL1_Rd))
                print('MAC to OL1(RF): %f' % (get_Energy_RF(size.B(384))))
                print('OL1(RF) to OL2: %f' % (Energy_OL2_Wr + get_Energy_RF(sram_case.OL1)))
                print('OL2 to DRAM: %f' % (Energy_GRS + Energy_DRAM + Energy_OL2_Rd))
                print('AL1 Size: %s' % str(sram_case.AL1))
                print('AL2 Size: %s' % str(sram_case.AL2))
                print('WL1 Size: %s' % str(sram_case.WL1))
                print('Area of 1 Chiplet: %s' % str(area_per_chiplet / 1000000))
                sys.exit()

            loop = sram_case.loopParameter
            writer.writerow({
                'Chiplet': design.chiplet,
                'Core': design.core,
                'Lane': design.lane,
                'Vector_Size': design.vector,
                'cin': workload.in_channel,
                'cout': workload.out_channel,
                'out_size': workload.out_size.H,
                'kernel_size': workload.kernel_size.H,
                'stride': workload.stride.H,
                'OL1': sram_case.OL1.to_b(),
                'AL1': sram_case.AL1.to_b(),
                'WL1': sram_case.WL1.to_b(),
                'real_WL1': evaluator.real_WL1.to_b(),
                'AL2': sram_case.AL2.to_b(),
                'reorder_case': sram_case.reorderCase.type_n,
                'rotation_enable': loop.rotation_enable,
                'runtime': total_runtime,
                'area-chiplet': area_per_chiplet,
                'area-package': area_per_package,
                'total_memory_footprint': total_memory,
                'total_energy': total_energy,
                'chiplet_communication': chiplet_communication.to_b(),
                'dram_communication': dram_access.to_b(),
                'DRAM_energy_A': Energy_DRAMtoSRAM_A,
                'DRAM_energy_W': Energy_DRAMtoSRAM_W,
                'DRAM_Energy': energy_breakdown.DRAM_energy,
                'Die-to-Die_Energy': energy_breakdown.D2D_energy,
                'A-L2_Energy': energy_breakdown.A_L2_energy,
                'A-L1_Energy': energy_breakdown.A_L1_energy,
                'W-L1_Energy': energy_breakdown.W_L1_energy,
                'Output_Energy': energy_breakdown.output_energy,
                'MAC_Energy': energy_breakdown.Energy_TotalMAC,
                'MOL1': total_memory_access.OL1_Wr.to_b(),
                'MAL1': total_memory_access.AL1_Wr.to_b(),
                'MWL1': total_memory_access.WL1_Wr.to_b(),
                # NOTE(review): unlike the neighbouring columns this writes
                # the Size object itself, not to_b(); left as-is so the CSV
                # format does not change.
                'sMWL1': evaluator.get_sram_access().WL1_Wr,
                'MAL2': total_memory_access.AL2_Wr.to_b(),
                'X1': loop.W1,
                'Y1': loop.H1,
                'K1': loop.K1,
                'X2': loop.W2,
                'Y2': loop.H2,
                'K2': loop.K2,
                'Kp': loop.package_spatial_parameter.Kp,
                'Yp': loop.package_spatial_parameter.Hp,
                'Kc': loop.chiplet_spatial_parameter.Kc,
                'Yc': loop.chiplet_spatial_parameter.Hc,
                'Xc': loop.chiplet_spatial_parameter.Wc,
                'C1': loop.C1,
                'C0': loop.C0,
                'Csa': loop.Csa,
                # 'Ksw': sram_case.loopParameter.Ksw,
                'X0': OL1_choices_map[sram_case.OL1].W,
                'Y0': OL1_choices_map[sram_case.OL1].H,
                'note': note
            })
def get_sram_access(self) -> MemoryAccess:
    """Count the read/write traffic of each on-chip buffer.

    Each count is a product of kernel size, basic-tile size and the
    temporal/spatial loop counts, wrapped in ``size.B``. WL1 and AL2 writes
    are additionally multiplied by a penalty from ``self.get_penalty`` based
    on the C3P information returned by ``self.c3p_analysis()``.

    Returns:
        ``MemoryAccess(WL1_Wr, WL1_Rd, OL1_Wr, OL1_Rd, AL1_Wr, AL1_Rd,
        AL2_Wr, AL2_Rd, OL2_Wr, OL2_Rd)``.

    Raises:
        ValueError: if AL1 is smaller than its first critical capacity.
    """
    case = self.memcase
    loop = case.loopParameter
    kernel_size = case.workload.kernel_size

    Fw, Fh = kernel_size.get_params()
    Csa = loop.get_rotation_count()
    W0, H0 = cfg.OL1_choices_map[case.OL1].get_params()
    C0, C1, W1, W2, H1, H2, K1, K2 = loop.get_temporal_count()
    _, _, _, Wc, Hc = loop.get_spatial_count()
    c3p_info, Cc_fake = self.c3p_analysis()

    # WL1 writes: minimum access count for one WL1, scaled by its penalty.
    wl1_min_writes = size.B(Fw * Fh * C0 * C1 * Csa * K1 * K2)
    WL1_penalty = self.get_penalty(case.WL1, c3p_info['WL1'], 1)
    WL1_Wr = wl1_min_writes * WL1_penalty
    # Weight-stationary reads.
    WL1_Rd = size.B(Fw * Fh * K1 * K2 * C0 * C1 * Csa * W1 * H1 * W2 * H2)

    # OL1 & OL2; the '4' in OL1 reflects 32-bit partial sums.
    # OL1 reads = MAC reads + reads forwarded to OL2.
    # NOTE(review): the trailing "- 1" is applied to the whole product;
    # confirm that is intended.
    OL1_Rd = size.B(4 * W0 * H0 * Fw * Fh * C1 * Csa * W1 * H1 * W2 * H2 * K2 * K1 - 1) \
        + size.B(4 * W0 * H0 * W1 * H1 * W2 * H2 * K2 * K1)
    # MAC updates.
    OL1_Wr = size.B(4 * W0 * H0 * Fw * Fh * C1 * Csa * W1 * H1 * W2 * H2 * K2 * K1)
    # Read from OL1 and written to OL2.
    OL2_Wr = size.B(W0 * H0 * W1 * H1 * W2 * H2 * K1 * K2 * Wc * Hc)
    # Read from OL2 and written to DDR.
    OL2_Rd = size.B(W0 * H0 * W1 * H1 * W2 * H2 * K1 * K2 * Wc * Hc)

    # AL1 acts like a pipeline register, reused for a basic tile of H0 * W0.
    if case.AL1 < c3p_info['AL1']['Critical_Capacity'][0]:
        raise ValueError(
            'The capacity should larger than Critical-Capacity-0'
        )  # See the Error description
    basic_tile_in = TileSize(W0, H0).in_tile(kernel_size, case.workload.stride)
    H0_in = basic_tile_in.H
    W0_in = basic_tile_in.W
    # Penalty index 1 is the product of all temporal loop counts.
    AL1_Wr = size.B(W0_in * H0_in * c3p_info['AL1']['Penalty'][1] * C0 * C1 * Csa)
    AL1_Rd = size.B(C0 * Csa * C1 * K1 * K2 * W0_in * H0_in * Fw * Fh * W1 * W2 * H1 * H2)

    # AL2: input footprint of one tile and of the whole chiplet plane.
    per_tile_in = cfg.TileSize(H0 * H1 * Hc, W0 * W1 * Wc).in_tile(
        kernel_size, case.workload.stride).size()
    whole_plane_in = cfg.TileSize(H0 * H1 * H2 * Hc, W0 * W1 * W2 * Wc).in_tile(
        kernel_size, case.workload.stride).size()
    if case.AL2 < Cc_fake['AL2'][2]:
        # AL2 cannot hold the whole chiplet workload: each tile is loaded
        # separately.
        al2_fill = per_tile_in * W2 * H2
    else:
        # AL2 can buffer the whole chiplet workload.
        al2_fill = whole_plane_in
    AL2_penalty = self.get_penalty(case.AL2, c3p_info['AL2'], 0)
    AL2_Wr = size.B(al2_fill * AL2_penalty * C0 * C1)
    # Broadcast to Kc cores (input reuse).
    AL2_Rd = AL1_Wr * Hc * Wc

    return MemoryAccess(WL1_Wr, WL1_Rd, OL1_Wr, OL1_Rd, AL1_Wr, AL1_Rd,
                        AL2_Wr, AL2_Rd, OL2_Wr, OL2_Rd)
def get_c3p_info(self):
    """Derive the critical capacities and penalties (C3P) of WL1, AL1 and AL2.

    The loop order comes from ``self.case.reorderCase.getreorder()``. For
    each buffer a list of critical capacities and a matching list of
    penalties is built:

    * A critical capacity stays ``None`` when the nearest loop with a count
      other than 1, scanning backwards from the relevant position in the
      loop order, belongs to the buffer's impact factors
      (``WImpactFactors`` for WL1, ``IAImpactFactors`` for AL2).
    * A penalty is the product of the loop counts met while scanning
      backwards from that position until the first impact-factor loop.

    Fix over the previous version: the positions of K1/K2/H1/H2 were looked
    up as ``forLoopSymbol.K1`` etc., i.e. on a leftover loop variable and
    not on the ``ForLoopSymbol`` class. That only resolved when the
    leftover value happened to expose those names as attributes. The
    lookups now use ``ForLoopSymbol`` directly.

    Returns:
        ``(self.c3p_info, self.Cc_fake)``: ``c3p_info`` maps 'WL1', 'AL1'
        and 'AL2' to dicts with 'Critical_Capacity' and 'Penalty' lists;
        ``Cc_fake['AL2']`` holds the unconditional AL2 capacities Cc1..Cc3.
    """
    loop_order = self.case.reorderCase.getreorder()
    basic_workload = OL1_choices_map[self.case.OL1]
    W0, H0 = basic_workload.get_params()
    W0_in, H0_in = basic_workload.in_tile(
        self.case.workload.kernel_size, self.case.workload.stride).get_params()
    C0, C1, W1, W2, H1, H2, K1, K2 = self.case.loopParameter.get_temporal_count()
    _, _, _, Wc, Hc = self.case.loopParameter.get_spatial_count()
    Csa = self.case.loopParameter.get_rotation_count()
    symbol_to_count = self.case.loopParameter.symbol_to_count

    def penalty_over(loops, impact_factors):
        # Multiply loop counts scanning backwards until an impact-factor loop.
        penalty = 1
        for symbol in reversed(loops):
            if symbol in impact_factors:
                break
            penalty = penalty * symbol_to_count(symbol)
        return penalty

    def nearest_real_loop_is_independent(loops, impact_factors):
        # True when the first loop with count != 1 (scanning backwards) is
        # not an impact factor; False when it is, or when there is none.
        for symbol in reversed(loops):
            if symbol_to_count(symbol) != 1:
                return symbol not in impact_factors
        return False

    self.c3p_info = {}
    self.Cc_fake = {}

    # ------------------------------- WL1 -------------------------------
    # WL1 corresponds to one lane. A critical point is None if it doesn't
    # make sense.
    #   Cc0: one basic workload (satisfied for most workloads).
    #   Cc1: buffer all input channels (extreme cases may fail, and should
    #        then consider ping-pong in the compiler).
    #   Cc2: buffer all mini-tiles with all input channels.
    #   Cc3: buffer all weights.
    Fx, Fy = self.case.workload.kernel_size.get_params()
    # At least one mapping of C0 kernels for a lane.
    self.WL1_Cc0: size.Size = size.B(Fx * Fy * C0)
    self.WL1_Cc1: Optional[size.Size] = None
    self.WL1_Cc2: Optional[size.Size] = None
    self.WL1_Cc3: Optional[size.Size] = None
    self.WL1_Cc0_Penalty: int = 1
    self.WL1_Cc1_Penalty: int = 1
    self.WL1_Cc2_Penalty: int = 1
    self.WL1_Cc3_Penalty: int = 1

    # Cc1 is always defined.
    self.WL1_Cc1 = self.WL1_Cc0 * C1 * Csa
    self.WL1_Cc1_Penalty = penalty_over(loop_order, WImpactFactors)

    # Cc2: considers the loops in front of K1.
    before_K1 = loop_order[:loop_order.index(ForLoopSymbol.K1)]
    if nearest_real_loop_is_independent(before_K1, WImpactFactors):
        self.WL1_Cc2 = self.WL1_Cc0 * C1 * Csa * K1
    self.WL1_Cc2_Penalty = penalty_over(before_K1, WImpactFactors)

    # Cc3: considers the loops in front of K2.
    before_K2 = loop_order[:loop_order.index(ForLoopSymbol.K2)]
    if nearest_real_loop_is_independent(before_K2, WImpactFactors):
        self.WL1_Cc3 = self.WL1_Cc0 * C1 * Csa * K1 * K2
    self.WL1_Cc3_Penalty = penalty_over(before_K2, WImpactFactors)

    self.c3p_info['WL1'] = {
        'Critical_Capacity': [
            self.WL1_Cc0, self.WL1_Cc1, self.WL1_Cc2, self.WL1_Cc3
        ],
        'Penalty': [
            self.WL1_Cc0_Penalty, self.WL1_Cc1_Penalty,
            self.WL1_Cc2_Penalty, self.WL1_Cc3_Penalty
        ],
    }

    # ------------------------------- AL1 -------------------------------
    # In this design AL1 is like a very small vector register in the
    # pipeline, so every loop counts towards its penalty.
    self.AL1_Cc0: size.Size = size.B(W0_in * H0_in * C0)
    self.AL1_Cc0_Penalty: int = 1
    self.AL1_Penalty: int = penalty_over(loop_order, ())
    self.c3p_info['AL1'] = {
        'Critical_Capacity': [self.AL1_Cc0],
        'Penalty': [self.AL1_Cc0_Penalty, self.AL1_Penalty],
    }

    # ------------------------------- AL2 -------------------------------
    #   Cc0: at least one mapping for all cores.
    #   Cc1: can buffer all input channels.
    #   Cc2: can buffer all mini-tiles.
    #   Cc3: can buffer all tiles (e.g. layers with a very small feature map).
    # The case generator is expected to guarantee AL2 >= Cc2; the checks
    # below guard against cases it missed.
    tileSize = TileSize(H0 * Hc, W0 * Wc)
    tileSize_in = tileSize.in_tile(self.case.workload.kernel_size,
                                   self.case.workload.stride)
    W_Cc2 = W0 * W1 * Wc
    H_Cc2 = H0 * H1 * Hc
    W_Cc3 = W0 * W1 * Wc * W2
    H_Cc3 = H0 * H1 * Hc * H2

    self.AL2_Cc0: size.Size = size.B(tileSize_in.size() * C0)
    self.AL2_Cc1_fake: size.Size = self.AL2_Cc0 * C1
    self.AL2_Cc2_fake = size.B(
        TileSize(H_Cc2, W_Cc2).in_tile(self.case.workload.kernel_size,
                                       self.case.workload.stride).size() * C0 * C1)
    self.AL2_Cc3_fake = size.B(
        TileSize(H_Cc3, W_Cc3).in_tile(self.case.workload.kernel_size,
                                       self.case.workload.stride).size() * C0 * C1)
    # A critical point is None if it doesn't make sense.
    self.AL2_Cc1: Optional[size.Size] = None
    self.AL2_Cc2: Optional[size.Size] = None
    self.AL2_Cc3: Optional[size.Size] = None
    self.AL2_Cc0_Penalty: int = 1
    self.AL2_Cc1_Penalty: int = 1
    self.AL2_Cc2_Penalty: int = 1
    self.AL2_Cc3_Penalty: int = 1

    # Cc1: considers the whole loop order.
    if nearest_real_loop_is_independent(loop_order, IAImpactFactors):
        self.AL2_Cc1 = self.AL2_Cc1_fake
    self.AL2_Cc1_Penalty = penalty_over(loop_order, IAImpactFactors)

    # Cc2: considers the loops in front of H1.
    before_H1 = loop_order[:loop_order.index(ForLoopSymbol.H1)]
    if nearest_real_loop_is_independent(before_H1, IAImpactFactors):
        self.AL2_Cc2 = self.AL2_Cc2_fake
    self.AL2_Cc2_Penalty = penalty_over(before_H1, IAImpactFactors)

    # Cc3: considers the loops in front of H2.
    before_H2 = loop_order[:loop_order.index(ForLoopSymbol.H2)]
    if nearest_real_loop_is_independent(before_H2, IAImpactFactors):
        self.AL2_Cc3 = self.AL2_Cc3_fake
    self.AL2_Cc3_Penalty = penalty_over(before_H2, IAImpactFactors)

    self.c3p_info['AL2'] = {
        'Critical_Capacity': [
            self.AL2_Cc0, self.AL2_Cc1, self.AL2_Cc2, self.AL2_Cc3
        ],
        'Penalty': [
            self.AL2_Cc0_Penalty, self.AL2_Cc1_Penalty,
            self.AL2_Cc2_Penalty, self.AL2_Cc3_Penalty
        ],
    }
    self.Cc_fake['AL2'] = [
        self.AL2_Cc1_fake, self.AL2_Cc2_fake, self.AL2_Cc3_fake
    ]

    return self.c3p_info, self.Cc_fake
def refresh_conf(actbit: float, wetbit: float):
    """Rebind the module-level configuration for a given bit-width mode.

    Args:
        actbit: activation bit width; must be 4, 8 or 16.
        wetbit: weight bit width.

    Always updates ``DATA_WIDTH``, ``WEIGHT_WIDTH``, ``A_BW_RATIO``,
    ``W_BW_RATIO`` and ``ACT_MEMORY_ALIGN``. For the four supported
    (activation, weight) pairs - (16, 8), (8, 8), (8, 4), (4, 4) - it also
    rebinds the parallelism lists, the buffer-size choices and the basic
    output-tile map. Any other pair leaves those settings as they were.

    Raises:
        ValueError: if ``actbit`` is not 4, 8 or 16. The check runs before
            any global is written, so a rejected call leaves the
            configuration untouched (previously the widths and ratios were
            already overwritten when the error was raised).
    """
    global num_cores
    global num_lanes
    global W_BW_RATIO
    global A_BW_RATIO
    global DATA_WIDTH
    global AL2_choices
    global AL1_choices
    global WL1_choices
    global WEIGHT_WIDTH
    global num_chiplets
    global size_vectors
    global OL1_choices_map
    global ACT_MEMORY_ALIGN

    # Validate first so an unsupported width cannot half-update the globals.
    if actbit == 16:
        act_memory_align = 8
    elif actbit == 8 or actbit == 4:
        act_memory_align = 16
    else:
        raise ValueError('The DATA-WIDTH only supports 4, 8, and 16bit')

    DATA_WIDTH = actbit
    WEIGHT_WIDTH = wetbit
    A_BW_RATIO = DATA_WIDTH / 8
    W_BW_RATIO = WEIGHT_WIDTH / 8
    ACT_MEMORY_ALIGN = act_memory_align

    # (activation bits, weight bits) ->
    #     (lanes, vector size, AL2 bytes, AL1 bytes, WL1 bytes)
    mode_table = {
        (16, 8): (8, 8, 23040, 4096, 1168),
        (8, 8): (16, 8, 46080, 8192, 1168),
        (8, 4): (16, 8, 46080, 8192, 2336),
        (4, 4): (16, 16, 92160, 16384, 2336),
    }
    mode = mode_table.get((DATA_WIDTH, WEIGHT_WIDTH))
    if mode is None:
        # Unlisted pairing: keep the previous parallelism and buffer settings.
        return
    lanes, vector, al2_bytes, al1_bytes, wl1_bytes = mode

    # Chiplet and core counts are the same in every supported mode.
    num_chiplets = [4]
    num_cores = [8]
    num_lanes = [lanes]
    size_vectors = [vector]
    AL2_choices = [size.B(al2_bytes)]
    AL1_choices = [size.B(al1_bytes)]
    WL1_choices = [size.B(wl1_bytes)]
    # TODO: MUSE-V3 can support more basic sizes
    OL1_choices_map = {
        size.B(3): TileSize(1, 1),
        size.B(12): TileSize(2, 2),
        size.B(48): TileSize(4, 4),
        size.B(192): TileSize(8, 8)
    }