def estimate_resources(self, N, M, app_settings, sync_buff_total_samps, pre_filt_latency):
    rscrs = rfnocsim.HwRsrcs()
    DSP_BLOCKS_PER_MAC = 3      # DSP blocks for a scaled complex MAC
    MAX_DSP_RATE = 400e6        # Max clock rate for a DSP48E block
    MAX_UNROLL_DEPTH = 2        # How many taps (or FFT bins) to compute in parallel?
    COEFF_SETS = 1              # We need two copies of coefficients: one live
                                # and one buffered for dynamic reload. If both
                                # live in BRAM, this should be 2. If the live
                                # set lives in registers, this should be 1.
    samp_rate = float(app_settings['samp_rate'])
    dsp_cyc_per_samp = MAX_DSP_RATE / samp_rate
    if app_settings['domain'] == 'time':
        fir_taps = app_settings['fir_taps']
        if fir_taps <= dsp_cyc_per_samp:
            unroll_factor = 1
            dsp_rate = samp_rate * fir_taps
        else:
            unroll_factor = math.ceil((1.0 * fir_taps) / dsp_cyc_per_samp)
            dsp_rate = MAX_DSP_RATE
        if unroll_factor > MAX_UNROLL_DEPTH:
            raise self.SimCompError(
                'Too many FIR coefficients! Reached loop unroll limit.')
        rscrs.add('DSP', DSP_BLOCKS_PER_MAC * unroll_factor * N * M)
        rscrs.add('BRAM_18kb', math.ceil(
            ColGlobals.BPI * app_settings['fir_dly_line'] /
            hw.Bee7Fpga.BRAM_BYTES) * N * M)    # FIR delay line memory
        rscrs.add('BRAM_18kb', math.ceil(
            ColGlobals.BPI * COEFF_SETS * fir_taps * unroll_factor * N * M /
            hw.Bee7Fpga.BRAM_BYTES))            # Coefficient storage
        samp_per_tick = dsp_rate / self.get_tick_rate()
        self.update_latency(func=pre_filt_latency +
                            (fir_taps / (samp_per_tick * unroll_factor)))
    else:
        fft_size = app_settings['fft_size']
        rscrs.add('DSP', DSP_BLOCKS_PER_MAC * N * M * MAX_UNROLL_DEPTH)  # MACs
        rscrs.add('BRAM_18kb', math.ceil(
            ColGlobals.BPI * N * M * fft_size * COEFF_SETS /
            hw.Bee7Fpga.BRAM_BYTES))            # Coefficient storage
        samp_per_tick = MAX_DSP_RATE / self.get_tick_rate()
        self.update_latency(func=pre_filt_latency + (fft_size / samp_per_tick))
    rscrs.add('BRAM_18kb', math.ceil(
        ColGlobals.BPI * sync_buff_total_samps / hw.Bee7Fpga.BRAM_BYTES))
    self.update_rsrcs(rscrs)
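# A worked example of the time-domain sizing arithmetic above (a sketch
# with hypothetical numbers; N, M and the app_settings values here are
# illustrative, not defaults of this model):
#
#   samp_rate = 100e6, fir_taps = 8, N = M = 4
#   dsp_cyc_per_samp = 400e6 / 100e6 = 4
#   unroll_factor    = ceil(8 / 4) = 2    (<= MAX_UNROLL_DEPTH, so no error)
#   DSP blocks       = DSP_BLOCKS_PER_MAC * unroll_factor * N * M
#                    = 3 * 2 * 4 * 4 = 96
#
# With fir_taps = 128 the unroll factor would be ceil(128 / 4) = 32, which
# exceeds MAX_UNROLL_DEPTH = 2 and raises SimCompError.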
def estimate_resources(self, radix, sync_buff_depth):
    rscrs = rfnocsim.HwRsrcs()
    # Assume that pipelined adders are inferred in logic (not DSP)
    # Assume that buffering uses BRAM
    rscrs.add('BRAM_18kb', math.ceil(
        ColGlobals.BPI * sync_buff_depth * radix / hw.Bee7Fpga.BRAM_BYTES))
    self.update_rsrcs(rscrs)
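# Example of the buffer sizing above (hypothetical constants: BPI = 4
# bytes/sample and BRAM_BYTES = 2304; both are actually defined elsewhere,
# in ColGlobals and hw.Bee7Fpga respectively):
#
#   radix = 16, sync_buff_depth = 1024
#   BRAM_18kb = ceil(4 * 1024 * 16 / 2304) = ceil(28.4) = 29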
def __init__(self, sim_core, name):
    self.sim_core = sim_core
    rfnocsim.SimComp.__init__(self, sim_core, name, rfnocsim.comptype.hardware)
    # Max resources from Virtex7 datasheet
    self.max_resources = rfnocsim.HwRsrcs()
    self.max_resources.add('DSP', 3600)
    self.max_resources.add('BRAM_18kb', 2940)
    self.resources = rfnocsim.HwRsrcs()
    # Each FPGA has 80 SERDES lanes
    self.max_io = 80
    self.serdes_i = dict()
    self.serdes_o = dict()
    # Each lane can carry at most 10GB/s. Each SERDES needs to have some
    # buffering. We assume elastic buffering (50% full on avg).
    io_buff_size = (self.IO_LN_BW * self.IO_LN_LATENCY) / self.ELASTIC_BUFF_FULLNESS
    # Worst case lane latency
    lane_latency = self.IO_LN_LATENCY * self.get_tick_rate()
    for i in range(self.max_io):
        self.serdes_i[i] = rfnocsim.Channel(
            sim_core, self.__ioln_name(i) + '/I', self.IO_LN_BW, lane_latency / 2)
        self.serdes_o[i] = rfnocsim.Channel(
            sim_core, self.__ioln_name(i) + '/O', self.IO_LN_BW, lane_latency / 2)
        self.resources.add('BRAM_18kb', 1 + math.ceil(
            io_buff_size / self.BRAM_BYTES))    # Input buffering per lane
        self.resources.add('BRAM_18kb', 1)      # Output buffering per lane
    # Other resources
    self.resources.add('BRAM_18kb', 72)     # BPS infrastructure + microblaze
    self.resources.add('BRAM_18kb', 128)    # 2 MIGs
    self.functions = dict()
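# Example of the per-lane elastic buffer sizing above (hypothetical
# constants, since IO_LN_BW, IO_LN_LATENCY, ELASTIC_BUFF_FULLNESS and
# BRAM_BYTES are class attributes defined elsewhere): with
# IO_LN_BW = 10e9 B/s, IO_LN_LATENCY = 1.5e-6 s and
# ELASTIC_BUFF_FULLNESS = 0.5,
#
#   io_buff_size = (10e9 * 1.5e-6) / 0.5 = 30000 bytes
#   BRAM_18kb per input lane = 1 + ceil(30000 / 2304) = 1 + 14 = 15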