def test_prod(self):
    ''' Product of every factorization equals the factorized value. '''
    for value in (24, 1024):
        for factors in util.factorize(value, 3):
            self.assertEqual(util.prod(factors), value)
def _make_data_layout(self, nfm, hfm, wfm, origin, nums, dims):
    '''
    Build a DataLayout that splits the fmap range of size
    (batch, nfm, hfm, wfm) into `nums` pieces per dimension, mapped onto
    the `dims` node region.
    '''
    assert util.prod(nums) == dims.size()

    sizes = (self.batch_size, nfm, hfm, wfm)

    def _node_coord(idxs):
        # Flatten the piece indices in the order n, b, w, h, i.e.,
        # dimensions 1, 0, 3, 2, then unflatten onto the 2D node array.
        flat = 0
        for d in (1, 0, 3, 2):
            flat = flat * nums[d] + idxs[d]
        assert flat < dims.size()
        return PhyDim2(*divmod(flat, dims.w))

    frmap = FmapRangeMap()
    for idxs in itertools.product(*(range(n) for n in nums)):
        # Evenly divide each dimension; piece i covers
        # [i * size // num, (i + 1) * size // num).
        beg = [idx * size // num
               for idx, num, size in zip(idxs, nums, sizes)]
        end = [(idx + 1) * size // num
               for idx, num, size in zip(idxs, nums, sizes)]
        frmap.add(FmapRange(beg, end), (_node_coord(idxs), ))

    layout = DataLayout(frmap=frmap, origin=origin, type=NodeRegion.DATA)
    # The pieces must cover the complete fmap range exactly.
    assert layout.frmap.complete_fmap_range().size() == util.prod(sizes)
    return layout
def test_scheme_dict(self):
    ''' get_scheme_dict. '''
    for bl_ts, bl_ords in self._gen_loopblocking_all():
        lbs = self._lbs(bl_ts, bl_ords, part_occ=self.part_occ)

        if not lbs.is_valid():
            # Invalid schemes yield no scheme dict.
            self.assertIsNone(lbs.get_scheme_dict(self.cost))
            continue

        scheme = lbs.get_scheme_dict(self.cost)

        # Scalar stats match the scheme's own accessors.
        self.assertAlmostEqual(scheme['cost'], lbs.get_cost(self.cost))
        self.assertAlmostEqual(scheme['ops'], lbs.ops)
        self.assertAlmostEqual(scheme['time'], lbs.time)
        # The access entry is the very same object, not a copy.
        self.assertEqual(id(scheme['access']), id(lbs.get_access()))

        # Per-level, per-data-category sizes.
        for lvl in (0, 1):
            for dce in range(de.NUM):
                self.assertAlmostEqual(scheme['size'][lvl][dce],
                                       lbs.data_size(lvl, dce))

        self.assertAlmostEqual(scheme['part_occ'], self.part_occ)

        # Blocking factors multiply back to the total loop counts.
        for key, lpe in (('ti', le.IFM), ('to', le.OFM), ('tb', le.BAT)):
            self.assertEqual(util.prod(scheme[key]),
                             self.nld['BASE'].loopcnt[lpe])
def do_access(self, idx_pr, cnt_pr, read=1, write=0):
    '''
    Access the buffer by `read` and/or `write`, with the unit index
    `idx_pr` and count `cnt_pr`, of all dimensions.

    Return the count of the accessing data to the next level, of all
    dimensions.
    '''
    # A bypassed buffer forwards every access to the next level untouched.
    if self.bypass:
        return cnt_pr

    # Range currently addressed by this access.
    rng = self._range_idx_pr(idx_pr)

    # Count this access against the buffer.
    self.access += (read + write) * util.prod(cnt_pr)

    if rng != self.data:
        # Miss: replace the buffered range, fetch it from the next level.
        self.data = rng
        return self.buf_cnt_pr

    # Hit: nothing is needed from the next level.
    return (0, 0)
def _make_bl_ts(self, ti_part, to_part, tb_part, wlkey='BASE'):
    '''
    Make a set of blocking factors.

    Each of `ti_part`, `to_part`, `tb_part` has three entries and can
    contain one 0 value as a placeholder to be filled; the 0 is replaced
    by the factor needed so the product covers the total loop count of
    the workload `wlkey` (rounded up with idivc).

    Return a tuple of per-level factor tuples (the zip of the per-loop
    factor lists), as expected by the loop blocking scheme.
    '''
    def _fill(part, lpe):
        # Replace the (at most one) 0 entry in `part` by the factor that
        # makes the product cover loopcnt[lpe]; if there is no 0 entry,
        # return `part` unchanged.
        try:
            idx = part.index(0)
        except ValueError:
            return part
        rest = util.prod(part[:idx] + part[idx + 1:])
        fill = util.idivc(self.nld[wlkey].loopcnt[lpe], rest)
        return [fill if x == idx else part[x] for x in range(3)]

    lp_ts = [None] * le.NUM
    lp_ts[le.IFM] = _fill(ti_part, le.IFM)
    lp_ts[le.OFM] = _fill(to_part, le.OFM)
    lp_ts[le.BAT] = _fill(tb_part, le.BAT)
    return tuple(zip(*lp_ts))
def test_int(self):
    ''' Integer products over various sequence types, signs, and zero. '''
    # The result of an all-int product is itself an int.
    self.assertIsInstance(util.prod([3, 5, 7]), int)

    cases = [
        ([3, 5, 7], 105),
        ([3, 5, -1], -15),
        ([3, -5, 7], -105),
        ([3, -5, 0], 0),
        ((3, 5, 7), 105),
        (set([3, 5, 7]), 105),
        # Iterating a dict yields its keys.
        ({3: 'a', 5: 'b', 7: 'c'}, 105),
    ]
    for seq, expected in cases:
        self.assertEqual(util.prod(seq), expected)
def _init_sub_range(self, lp_t_list, dim_loops):
    '''
    Build the ordered list of 2D subrange indices generated by the loops
    in `lp_t_list` (given from outer to inner) over the two data
    dimensions `dim_loops`, and the per-dimension unit count of a single
    subrange.
    '''
    assert len(dim_loops) == 2

    # Start from a single zero subrange and grow it loop by loop.
    srlist = [(0, 0)]
    sz_pr = [1, 1]

    # Walk the loops from the innermost to the outermost.
    for lpe, t in reversed(lp_t_list):
        if lpe not in dim_loops:
            # This loop does not index the data; skip it.
            continue
        dim = dim_loops.index(lpe)

        # Stride: size of this dimension over all inner loops so far.
        stride = sz_pr[dim]

        # Replicate the current loop body t times along `dim`, shifting
        # each copy by the stride; order matches looping over the factor
        # outermost and the existing subranges innermost.
        srlist = [tuple(v + i * stride if d == dim else v
                        for d, v in enumerate(sr))
                  for i in range(t) for sr in srlist]

        sz_pr[dim] *= t

    # Sanity: subranges are distinct and fully tile the space.
    assert len(set(srlist)) == len(srlist)
    assert len(srlist) == util.prod(sz_pr)

    cnt_pr = tuple(bc // sz for bc, sz in zip(self.buf_cnt_pr, sz_pr))
    return srlist, cnt_pr
def _sim_access_conv(self, lbs, get_bufshr=False):
    '''
    Get data access by actually simulating and generating loops for CONV
    layer.

    If `get_bufshr` is True, also return bufshr stats as a third tuple
    (rotation access, wide fetch access, rotation rounds).
    '''
    self.assertTrue(lbs.is_valid(), '_sim_access_conv: invalid lbs.')

    # FIX: `data_loops` was assigned twice with the same value; keep one.
    data_loops = lbs.nld.data_loops

    # Per-loop blocking factors across all levels, indexable by loop.
    lpts = tuple(zip(*lbs.bl_ts))

    subgrp_size, rot_unit_cnt, lp_t_list = self._bufshr_params(lbs)

    # Get buffered unit counts at each level.
    # DRAM holds the whole data: product of all level factors.
    dram_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]
    # GBUF holds everything below the top blocking level.
    gbuf_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe][1:]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]
    # REGF holds everything below the top two blocking levels.
    regf_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe][2:]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]

    # Initialize SimBuffer.
    drams = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(dram_buf_cnt_pr_list):
        drams[dce] = self._SimBuffer(
            dce, buf_cnt_pr,
            # When GBUF is bypassed, DRAM is accessed at the GBUF unit
            # access cost.
            lbs.nld.unit_access[me.DRAM][dce]
            if lbs.stored_in_gbuf[dce]
            else lbs.nld.unit_access[me.GBUF][dce],
        )
    gbufs = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(gbuf_buf_cnt_pr_list):
        gbufs[dce] = self._SimBufferSharing(
            dce, buf_cnt_pr,
            lbs.nld.unit_access[me.GBUF][dce],
            subgrp_size[dce], rot_unit_cnt[dce],
            lp_t_list, data_loops[dce].loops(),
            bypass=(not lbs.stored_in_gbuf[dce]))
    regfs = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(regf_buf_cnt_pr_list):
        regfs[dce] = self._SimBuffer(
            dce, buf_cnt_pr,
            lbs.nld.unit_access[me.REGF][dce],
        )

    # Already generated psum for OFM.
    ofm_psum = set()

    # Simulation: walk every loop index and propagate accesses up the
    # memory hierarchy until a level fully absorbs them.
    for idx_tuple in lbs.gen_index():
        for dce in range(de.NUM):
            idx_pr = tuple(data_loops[dce].take(idx_tuple))

            if dce == de.OFM:
                # Fetch and writeback, unless for the first time (no
                # fetch).
                write = 1
                read = 1 if idx_pr in ofm_psum else 0
                ofm_psum.add(idx_pr)
            else:
                read = 1
                write = 0

            # PE.
            cnt_pr = (1, 1)

            # REGF.
            cnt_pr = regfs[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

            # GBUF.
            cnt_pr = gbufs[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

            # DRAM.
            cnt_pr = drams[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

    dram_access = [drams[dce].access_size() for dce in range(de.NUM)]
    gbuf_access = [gbufs[dce].access_size() for dce in range(de.NUM)]

    # Sum over all nodes.
    dram_access = [a * lbs.num_nodes // r
                   for a, r in zip(dram_access, lbs.accfwd_reduction)]
    gbuf_access = [a * lbs.num_nodes for a in gbuf_access]

    # Buffer sharing.
    if get_bufshr:
        # Bufshr stats are per subgroup; scale by the subgroup count.
        rotation_access = [
            gbufs[dce].rotation_access_size()
            * (lbs.num_nodes // subgrp_size[dce])
            for dce in range(de.NUM)]
        wide_fetch_access = [
            gbufs[dce].wide_fetch_access_size()
            * (lbs.num_nodes // subgrp_size[dce])
            for dce in range(de.NUM)]
        rotation_rounds = [gbufs[dce].rotation_rounds()
                           for dce in range(de.NUM)]
        return dram_access, gbuf_access, \
                (rotation_access, wide_fetch_access, rotation_rounds)

    # Without bufshr there must be no rotation or wide-fetch traffic.
    for dce in range(de.NUM):
        self.assertAlmostEqual(gbufs[dce].rotation_access_size(), 0,
                               msg='_sim_access_conv: non-0 '
                               'rotation access with no bufshr.')
        self.assertAlmostEqual(gbufs[dce].wide_fetch_access_size(), 0,
                               msg='_sim_access_conv: non-0 '
                               'wide fetch access with no bufshr.')
        self.assertEqual(gbufs[dce].rotation_rounds(), 0,
                         msg='_sim_access_conv: non-0 '
                         'rotation rounds with no bufshr.')

    return dram_access, gbuf_access
def do_access(self, idx_pr, cnt_pr, read=1, write=0):
    '''
    Access the shared buffer: perform the base buffer access, then account
    for buffer-sharing rotation and wide-fetch traffic on top of it.

    Return the count of the accessing data to the next level, as given by
    the base buffer.
    '''
    # Base buffer access; `ret` is what must go to the next level.
    ret = self.base.do_access(idx_pr, cnt_pr, read=read, write=write)

    if self.bypass:
        # Bypass, skip buffer sharing.
        return ret

    # Range index.
    ridx_pr = self._range_idx_pr(idx_pr)

    if any(ret):
        # Miss in the shared buffer and load new range. Reset.
        self.cur_rot_unit = 0

        self.rot_step_cnt.setdefault(ridx_pr, 0)

        if self.cur_rot_step_cnt == 0:
            # Initial fetch, no replaced data yet.
            assert self.rot_rnd_cnt_per_load is None
        else:
            # The steps taken for the replaced range must form a whole
            # number of rotation rounds, and that number must be the same
            # for every loaded range.
            rot_rnd_cnt_per_load, rem_ = divmod(
                self.cur_rot_step_cnt, self.rot_steps_per_round)
            assert rem_ == 0
            assert self.rot_rnd_cnt_per_load is None \
                    or self.rot_rnd_cnt_per_load == rot_rnd_cnt_per_load
            self.rot_rnd_cnt_per_load = rot_rnd_cnt_per_load
            self.cur_rot_step_cnt = 0

    # A single access never exceeds one subrange in any dimension.
    assert all(cnt <= subrng_cnt
               for cnt, subrng_cnt in zip(cnt_pr, self.subrng_cnt_pr))

    # Subrange index.
    sridx_pr = self._subrange_idx_pr(idx_pr)

    # Rotation unit index.
    ru_idx = self._subrng_rot_unit_idx(sridx_pr)

    if ru_idx != self.cur_rot_unit:
        # Move to next rotation unit.

        if (self.cur_rot_unit + 1) * self.rot_unit_size \
                >= self.subrng_num:
            # The current rotation unit is the last one. Start a new
            # rotation round.
            # Do not rotate back to the initial state. Instead start
            # from the current state.
            self.cur_rot_unit = 0
            self.last_wf_subrng_idx = 0
            self.seq_wf_acc = 0
        elif self.cur_rot_unit * self.rot_unit_size \
                + self.buf_subrng_num >= self.subrng_num:
            # The last rotation unit is already local. No more rotation.
            self.cur_rot_unit += 1
        else:
            # Rotate by one rotation unit, but not exceeding the end.
            offset = min(
                self.rot_unit_size,
                self.subrng_num
                - self.cur_rot_unit * self.rot_unit_size
                - self.buf_subrng_num)
            assert offset > 0
            # All subranges shift by the above offset.
            acc_ = (1. * offset / self.buf_subrng_num) * self.subrng_num
            self.rot_access += util.prod(self.subrng_cnt_pr) * acc_
            self.cur_rot_unit += 1

        # One rotation step.
        self.rot_step_cnt[ridx_pr] += 1
        self.cur_rot_step_cnt += 1

        # Combine wide fetch with rotation: the wide fetches accumulated
        # sequentially since the last step are counted as saved rather
        # than as wide-fetch traffic.
        self.wf_access -= self.seq_wf_acc
        self.saved_wf_access += self.seq_wf_acc
        self.seq_wf_acc = 0

    assert ru_idx == self.cur_rot_unit

    # Buffer index of which has this subrange.
    buf_idx = self._subrng_buf_idx(sridx_pr)

    # Wide fetch from possibly remote buffer, scaled by the index of the
    # buffer owning the subrange.
    wf_acc = util.prod(cnt_pr) * (read + write) * buf_idx
    self.wf_access += wf_acc

    # Record amount of sequential wide fetch.
    subrng_idx = self.subrng_idx_dict[sridx_pr]
    if subrng_idx >= self.last_wf_subrng_idx:
        self.seq_wf_acc += wf_acc
    else:
        self.seq_wf_acc = wf_acc
    # NOTE(review): `last_wf_subrng_idx` is only ever reset at a rotation
    # round wrap-around, never updated to `subrng_idx` here — confirm this
    # is intended.

    return ret
def test_nested_loop_desc_sanity(self):
    ''' Generated nested loop description sanity check. '''
    batch_size = 4

    for layer in self.convlayers.values() + self.fclayers.values() \
            + self.lrlayers.values() + self.fake_layers.values():
        ms = MapStrategyEyeriss(layer, batch_size, self.dim_array)

        for nld in ms.gen_nested_loop_desc():
            # Replication reduces numbers of IFM/OFM.
            self.assertGreaterEqual(layer.nifm, nld.loopcnt[le.IFM])
            self.assertGreaterEqual(layer.nofm, nld.loopcnt[le.OFM])
            # Folding increases batch size.
            self.assertEqual(nld.loopcnt[le.BAT] % batch_size, 0)

            # Total and unit ops.
            self.assertAlmostEqual(nld.total_ops(),
                                   layer.total_ops(batch_size))
            self.assertAlmostEqual(nld.unit_ops * util.prod(nld.loopcnt),
                                   layer.total_ops(batch_size))

            # Unit time and unit ops.
            # The difference is due to the loop occupation, which is not
            # counted in utilization.
            self.assertGreaterEqual(
                nld.unit_time * ms.utilization() * self.dim_array.size(),
                nld.unit_ops)

            # Total access at DRAM.
            self.assertAlmostEqual(
                nld.total_access_at_of(me.DRAM, de.FIL),
                layer.total_filter_size()
                if isinstance(layer, ConvLayer) else 0)
            # IFM may have refetch due to folding.
            self.assertGreaterEqual(
                nld.total_access_at_of(me.DRAM, de.IFM) + 1e-7,
                layer.total_ifmap_size(batch_size))
            self.assertAlmostEqual(
                nld.total_access_at_of(me.DRAM, de.OFM),
                layer.total_ofmap_size(batch_size))

            # Unit access to REGF.
            self.assertAlmostEqual(
                nld.unit_access[me.REGF][de.FIL] * util.prod(nld.loopcnt),
                layer.total_ops(batch_size)
                if isinstance(layer, ConvLayer) else 0)
            self.assertAlmostEqual(
                nld.unit_access[me.REGF][de.IFM] * util.prod(nld.loopcnt),
                layer.total_ops(batch_size))
            self.assertAlmostEqual(
                nld.unit_access[me.REGF][de.OFM] * util.prod(nld.loopcnt),
                layer.total_ops(batch_size))

            # Unit GBUF size and unit access to DRAM.
            self.assertTrue(all(
                us >= ua for us, ua
                in zip(nld.usize_gbuf, nld.unit_access[me.DRAM])))

            # Unit REGF size.
            if isinstance(layer, ConvLayer):
                # See JSSC'17, IV. A. Dimensions Beyond 2-D in PE Array.
                # 1).
                self.assertEqual(nld.usize_regf[de.FIL], layer.wfil)
                self.assertEqual(nld.usize_regf[de.IFM], layer.wfil)
                self.assertEqual(nld.usize_regf[de.OFM], 1)

            # Data dimension loops.
            if isinstance(layer, ConvLayer):
                self.assertEqual(nld.data_loops[de.FIL],
                                 DataDimLoops(le.IFM, le.OFM))
                self.assertEqual(nld.data_loops[de.IFM],
                                 DataDimLoops(le.IFM, le.BAT))
                self.assertEqual(nld.data_loops[de.OFM],
                                 DataDimLoops(le.OFM, le.BAT))
            else:
                # FIX: this branch was `elif isinstance(layer, ConvLayer)`
                # -- identical to the branch above and thus unreachable.
                # Non-CONV layers have no filter data.
                self.assertEqual(nld.data_loops[de.FIL], DataDimLoops())
                self.assertEqual(nld.data_loops[de.IFM],
                                 DataDimLoops(le.OFM, le.BAT))
                self.assertEqual(nld.data_loops[de.OFM],
                                 DataDimLoops(le.OFM, le.BAT))
def _sim_access_conv(self, lbs):
    '''
    Get data access by actually simulating and generating loops for CONV
    layer.

    Return (dram_access, gbuf_access), each a per-data-category list.
    '''
    self.assertTrue(lbs.is_valid(), '_sim_access_conv: invalid lbs.')

    data_loops = lbs.nld.data_loops

    # FIX: materialize as a tuple. `zip()` returns a one-shot iterator on
    # Python 3, so the repeated indexing below would fail; this also
    # matches the sibling bufshr variant of this helper.
    lpts = tuple(zip(*lbs.bl_ts))

    # Get buffered unit counts at each level.
    # DRAM holds the whole data: product of all level factors.
    dram_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]
    # GBUF holds everything below the top blocking level.
    gbuf_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe][1:]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]
    # REGF holds everything below the top two blocking levels.
    regf_buf_cnt_pr_list = [
        tuple(util.prod(lpts[lpe][2:]) for lpe in data_loops[dce].loops())
        for dce in range(de.NUM)]

    # Initialize SimBuffer.
    drams = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(dram_buf_cnt_pr_list):
        drams[dce] = self._SimBuffer(
            dce, buf_cnt_pr,
            # When GBUF is bypassed, DRAM is accessed at the GBUF unit
            # access cost.
            lbs.nld.unit_access[me.DRAM][dce]
            if lbs.stored_in_gbuf[dce]
            else lbs.nld.unit_access[me.GBUF][dce],
        )
    gbufs = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(gbuf_buf_cnt_pr_list):
        gbufs[dce] = self._SimBuffer(
            dce, buf_cnt_pr,
            lbs.nld.unit_access[me.GBUF][dce],
            bypass=(not lbs.stored_in_gbuf[dce]),
        )
    regfs = [None] * de.NUM
    for dce, buf_cnt_pr in enumerate(regf_buf_cnt_pr_list):
        regfs[dce] = self._SimBuffer(
            dce, buf_cnt_pr,
            lbs.nld.unit_access[me.REGF][dce],
        )

    # Already generated psum for OFM.
    ofm_psum = set()

    # Simulation: walk every loop index and propagate accesses up the
    # memory hierarchy until a level fully absorbs them.
    for idx_tuple in lbs.gen_index():
        for dce in range(de.NUM):
            idx_pr = tuple(data_loops[dce].take(idx_tuple))

            if dce == de.OFM:
                # Fetch and writeback, unless for the first time (no
                # fetch).
                write = 1
                read = 1 if idx_pr in ofm_psum else 0
                ofm_psum.add(idx_pr)
            else:
                read = 1
                write = 0

            # PE.
            cnt_pr = (1, 1)

            # REGF.
            cnt_pr = regfs[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

            # GBUF.
            cnt_pr = gbufs[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

            # DRAM.
            cnt_pr = drams[dce].do_access(idx_pr, cnt_pr, read, write)
            if not any(cnt_pr):
                continue

    dram_access = [drams[dce].access_size() for dce in range(de.NUM)]
    gbuf_access = [gbufs[dce].access_size() for dce in range(de.NUM)]

    return dram_access, gbuf_access
def test_limits(self):
    ''' Factorizations respect the per-factor limits. '''
    caps = (10, 20)
    for factors in util.factorize(1024, 3, limits=caps):
        self.assertLessEqual(factors[0], caps[0])
        self.assertLessEqual(factors[1], caps[1])
        self.assertEqual(util.prod(factors), 1024)
def test_empty(self):
    ''' Product of an empty container is the multiplicative identity. '''
    for empty in ([], tuple(), set()):
        self.assertEqual(util.prod(empty), 1)
def test_float(self):
    ''' Products involving floats, including a negative factor. '''
    cases = (([1.1, 2, 3], 6.6), ([1.1, 2, -3.], -6.6))
    for seq, expected in cases:
        self.assertAlmostEqual(util.prod(seq), expected)